#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains a parser for OXML files (i.e. MS Office 2007+)."""
|
|
|
|
import logging
|
|
import re
|
|
import struct
|
|
import zipfile
|
|
|
|
from xml.etree import ElementTree
|
|
|
|
from plaso.events import time_events
|
|
from plaso.lib import errors
|
|
from plaso.lib import eventdata
|
|
from plaso.lib import timelib
|
|
from plaso.parsers import interface
|
|
from plaso.parsers import manager
|
|
|
|
|
|
# Module-level author attribution (name and contact e-mail address).
__author__ = 'David Nides (david.nides@gmail.com)'
|
|
|
|
|
|
class OpenXMLParserEvent(time_events.TimestampEvent):
  """Process timestamps from MS Office XML Events.

  Besides the timestamp itself, every key/value pair of the extracted
  metadata is copied onto the event object as an attribute.
  """

  DATA_TYPE = 'metadata:openxml'

  def __init__(self, timestamp_string, usage, metadata):
    """Initializes the event object.

    Args:
      timestamp_string: An ISO 8601 representation of a timestamp.
      usage: The description of the usage of the time value.
      metadata: A dict object containing extracted metadata.
    """
    timestamp = timelib.Timestamp.FromTimeString(timestamp_string)
    super(OpenXMLParserEvent, self).__init__(timestamp, usage, self.DATA_TYPE)

    # dict.items() is available on both Python 2 and Python 3, whereas
    # dict.iteritems() only exists on Python 2.
    for key, value in metadata.items():
      setattr(self, key, value)
|
|
|
|
|
|
class OpenXMLParser(interface.BaseParser):
  """Parse metadata from OXML files.

  The file is opened as a Zip container. Every relationship in _rels/.rels
  whose Type mentions "properties" is followed, and the elements of the
  referenced XML part are turned into metadata values or timestamps. One
  event is produced per timestamp that was found.
  """

  NAME = 'openxml'
  DESCRIPTION = u'Parser for OpenXML (OXML) files.'

  # Maps XML tag names onto the attribute names used for the metadata.
  # Tags that are not listed here are converted with _FixString().
  _METAKEY_TRANSLATE = {
      'creator': 'author',
      'lastModifiedBy': 'last_saved_by',
      'Total_Time': 'total_edit_time',
      'Pages': 'num_pages',
      'Characters_with_spaces': 'num_chars_w_spaces',
      'Paragraphs': 'num_paragraphs',
      'Characters': 'num_chars',
      'Lines': 'num_lines',
      'revision': 'revision_num',
      'Words': 'num_words',
      'Application': 'creating_app',
      'Shared_Doc': 'shared',
  }

  # Archive members that must all be present for the file to be parsed.
  _FILES_REQUIRED = frozenset([
      '[Content_Types].xml', '_rels/.rels', 'docProps/core.xml'])

  # Tags whose text is kept as a timestamp instead of as plain metadata.
  _TIMESTAMP_TAGS = frozenset(['created', 'modified', 'lastPrinted'])

  def _FixString(self, key):
    """Convert CamelCase to lower_with_underscore.

    Args:
      key: The string to convert, e.g. an XML tag name.

    Returns:
      The converted string in lower case.
    """
    # TODO: Add unicode support.
    fix_key = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', key)
    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', fix_key).lower()

  def _GetLocalTag(self, element_tag):
    """Return an ElementTree tag without its "{namespace}" prefix.

    Args:
      element_tag: The tag of an element, with or without a namespace prefix.

    Returns:
      The part of the tag after the last "}", or the whole tag when it has
      no namespace prefix.
    """
    # rpartition() keeps the complete string when there is no "}" at all;
    # partition() would yield an empty tag name for such elements.
    return element_tag.rpartition('}')[2]

  def _ReadProperties(self, zip_container, rels_root):
    """Collect metadata and timestamps from the property parts.

    Args:
      zip_container: The opened Zip file (instance of zipfile.ZipFile).
      rels_root: The root element of the parsed _rels/.rels part.

    Returns:
      A tuple of two dicts: the metadata keyed by attribute name and the
      timestamp strings keyed by XML tag name.
    """
    metadata = {}
    timestamps = {}

    for relationship in rels_root.iter():
      # Elements without a Type attribute are never property relationships.
      if 'properties' not in (relationship.get('Type') or ''):
        continue

      try:
        xml = zip_container.read(relationship.get('Target'))
        root = ElementTree.fromstring(xml)
      except (
          OverflowError, IndexError, KeyError, ValueError,
          zipfile.BadZipfile, ElementTree.ParseError) as exception:
        # A property part that cannot be read or is not well-formed XML is
        # skipped so the remaining parts can still be processed.
        logging.warning(
            u'[{0:s}] unable to read property with error: {1!s}.'.format(
                self.NAME, exception))
        continue

      for element in root.iter():
        if not element.text:
          continue

        tag = self._GetLocalTag(element.tag)
        # Not including the 'lpstr' attribute because it is very verbose.
        if tag == 'lpstr':
          continue

        if tag in self._TIMESTAMP_TAGS:
          timestamps[tag] = element.text
        else:
          tag_name = self._METAKEY_TRANSLATE.get(tag, self._FixString(tag))
          metadata[tag_name] = element.text

    return metadata, timestamps

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from an OXML file.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file is not a Zip file, the Zip file cannot
                         be opened or a required OXML member is missing.
    """
    # NOTE(review): file_object is not closed anywhere in this method;
    # confirm whether the caller owns it.
    file_object = file_entry.GetFileObject()

    if not zipfile.is_zipfile(file_object):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'Not a Zip file.'))

    try:
      zip_container = zipfile.ZipFile(file_object, 'r')
    except (zipfile.BadZipfile, struct.error, zipfile.LargeZipFile):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'Bad Zip file.'))

    zip_name_list = set(zip_container.namelist())

    if not self._FILES_REQUIRED.issubset(zip_name_list):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'OXML element(s) missing.'))

    # Add ourselves to the parser chain, which will be used in all subsequent
    # event creation in this parser.
    parser_chain = self._BuildParserChain(parser_chain)

    try:
      rels_xml = zip_container.read('_rels/.rels')
      rels_root = ElementTree.fromstring(rels_xml)
    except (zipfile.BadZipfile, ElementTree.ParseError) as exception:
      # The exception is formatted with !s because an exception object does
      # not accept the "s" format specification on Python 3.4 and later.
      logging.error(
          u'Unable to parse file {0:s} with error: {1!s}'.format(
              file_entry.name, exception))
      return

    metadata, timestamps = self._ReadProperties(zip_container, rels_root)

    # One event per timestamp that is present, always in this order.
    timestamp_usages = (
        ('created', eventdata.EventTimestamp.CREATION_TIME),
        ('modified', eventdata.EventTimestamp.MODIFICATION_TIME),
        ('lastPrinted', eventdata.EventTimestamp.LAST_PRINTED))

    for tag, usage in timestamp_usages:
      timestamp_string = timestamps.get(tag)
      if not timestamp_string:
        continue

      event_object = OpenXMLParserEvent(timestamp_string, usage, metadata)
      parser_context.ProduceEvent(
          event_object, parser_chain=parser_chain, file_entry=file_entry)
|
|
|
|
|
|
# Runs at import time: hands the OpenXMLParser class to the parsers manager.
manager.ParsersManager.RegisterParser(OpenXMLParser)
|