plaso-rubanetra/plaso/parsers/oxml.py
2020-04-06 18:48:34 +02:00

184 lines
6.3 KiB
Python

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains a parser for OXML files (i.e. MS Office 2007+)."""
import logging
import re
import struct
import zipfile
from xml.etree import ElementTree
from plaso.events import time_events
from plaso.lib import errors
from plaso.lib import eventdata
from plaso.lib import timelib
from plaso.parsers import interface
from plaso.parsers import manager
# Module author metadata.
__author__ = 'David Nides (david.nides@gmail.com)'
class OpenXMLParserEvent(time_events.TimestampEvent):
  """Timestamp event carrying the metadata extracted from an OXML file."""

  DATA_TYPE = 'metadata:openxml'

  def __init__(self, timestamp_string, usage, metadata):
    """Initializes the event object.

    The time string is converted with timelib.Timestamp.FromTimeString and
    every key/value pair of the metadata is set as an attribute of the event.

    Args:
      timestamp_string: An ISO 8601 representation of a timestamp.
      usage: The description of the usage of the time value.
      metadata: A dict object containing extracted metadata.
    """
    parsed_timestamp = timelib.Timestamp.FromTimeString(timestamp_string)
    super(OpenXMLParserEvent, self).__init__(
        parsed_timestamp, usage, self.DATA_TYPE)

    # Expose each metadata entry directly as an attribute of the event.
    for attribute_name, attribute_value in metadata.iteritems():
      setattr(self, attribute_name, attribute_value)
class OpenXMLParser(interface.BaseParser):
  """Parse metadata from OXML files.

  The parser opens the file as a Zip container, follows the relationships in
  '_rels/.rels' whose type mentions 'properties', and collects the text of
  the elements found in the referenced property parts. One event is produced
  for each of the created, modified and lastPrinted timestamps that is
  present; every event carries the same collected metadata.
  """

  NAME = 'openxml'
  DESCRIPTION = u'Parser for OpenXML (OXML) files.'

  # Property element names that are stored under a different metadata key
  # than the one _FixString would derive.
  _METAKEY_TRANSLATE = {
      'creator': 'author',
      'lastModifiedBy': 'last_saved_by',
      'Total_Time': 'total_edit_time',
      'Pages': 'num_pages',
      'Characters_with_spaces': 'num_chars_w_spaces',
      'Paragraphs': 'num_paragraphs',
      'Characters': 'num_chars',
      'Lines': 'num_lines',
      'revision': 'revision_num',
      'Words': 'num_words',
      'Application': 'creating_app',
      'Shared_Doc': 'shared',
  }

  # Archive members that must all be present for the file to be parsed.
  _FILES_REQUIRED = frozenset([
      '[Content_Types].xml', '_rels/.rels', 'docProps/core.xml'])

  # Timestamp element names paired with the usage of the event they produce,
  # listed in the order in which the events are produced.
  _TIMESTAMP_USAGES = (
      ('created', eventdata.EventTimestamp.CREATION_TIME),
      ('modified', eventdata.EventTimestamp.MODIFICATION_TIME),
      ('lastPrinted', eventdata.EventTimestamp.LAST_PRINTED))

  # Patterns used by _FixString, compiled once instead of on every call.
  _CAMEL_CASE_WORD_RE = re.compile(r'(.)([A-Z][a-z]+)')
  _CAMEL_CASE_BOUNDARY_RE = re.compile(r'([a-z0-9])([A-Z])')

  def _FixString(self, key):
    """Convert CamelCase to lower_with_underscore.

    For example 'lastModifiedBy' becomes 'last_modified_by'.

    Args:
      key: The string to convert.

    Returns:
      The converted, lower case string.
    """
    # TODO: Add unicode support.
    fix_key = self._CAMEL_CASE_WORD_RE.sub(r'\1_\2', key)
    return self._CAMEL_CASE_BOUNDARY_RE.sub(r'\1_\2', fix_key).lower()

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from an OXML file.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file is not a Zip file, the Zip file cannot
                         be opened or a required OXML member is missing.
    """
    # NOTE(review): file_object is never explicitly closed here; confirm
    # whether the caller or the file entry owns its lifetime.
    file_object = file_entry.GetFileObject()

    if not zipfile.is_zipfile(file_object):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'Not a Zip file.'))

    try:
      zip_container = zipfile.ZipFile(file_object, 'r')
    except (zipfile.BadZipfile, struct.error, zipfile.LargeZipFile):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'Bad Zip file.'))

    zip_name_list = set(zip_container.namelist())
    if not self._FILES_REQUIRED.issubset(zip_name_list):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'OXML element(s) missing.'))

    # Add ourselves to the parser chain, which will be used in all subsequent
    # event creation in this parser.
    parser_chain = self._BuildParserChain(parser_chain)

    try:
      rels_xml = zip_container.read('_rels/.rels')
      rels_root = ElementTree.fromstring(rels_xml)
    except (zipfile.BadZipfile, ElementTree.ParseError) as exception:
      # A malformed relationships part is reported the same way as an
      # unreadable one, instead of escaping as an unhandled ParseError.
      # The exception is formatted with !s because exception objects do not
      # accept the 's' format specification on Python 3.
      logging.error(
          u'Unable to parse file {0:s} with error: {1!s}'.format(
              file_entry.name, exception))
      return

    metadata = {}
    timestamps = {}
    timestamp_tags = frozenset(tag for tag, _ in self._TIMESTAMP_USAGES)

    for relationship in rels_root.iter():
      # Only relationships whose type mentions 'properties' are followed;
      # elements without a Type attribute are skipped.
      if 'properties' not in (relationship.get('Type') or ''):
        continue

      try:
        xml = zip_container.read(relationship.get('Target'))
        root = ElementTree.fromstring(xml)
      except (
          OverflowError, IndexError, KeyError, ValueError,
          zipfile.BadZipfile, ElementTree.ParseError) as exception:
        # ParseError is not a ValueError, so it has to be listed explicitly
        # for a malformed property part to be skipped like other bad parts.
        logging.warning(
            u'[{0:s}] unable to read property with error: {1!s}.'.format(
                self.NAME, exception))
        continue

      for element in root.iter():
        if not element.text:
          continue

        # Strip the '{namespace}' prefix. rpartition keeps the whole tag when
        # the element has no namespace, where partition would yield ''.
        tag = element.tag.rpartition('}')[2]

        # Not including the 'lpstr' attribute because it is
        # very verbose.
        if tag == 'lpstr':
          continue

        if tag in timestamp_tags:
          timestamps[tag] = element.text
        else:
          tag_name = self._METAKEY_TRANSLATE.get(tag, self._FixString(tag))
          metadata[tag_name] = element.text

    # Produce one event per timestamp that was found, each with the same
    # metadata.
    for tag, usage in self._TIMESTAMP_USAGES:
      timestamp_string = timestamps.get(tag)
      if not timestamp_string:
        continue

      event_object = OpenXMLParserEvent(timestamp_string, usage, metadata)
      parser_context.ProduceEvent(
          event_object, parser_chain=parser_chain, file_entry=file_entry)
# Register the parser class with the parsers manager when this module is
# imported.
manager.ParsersManager.RegisterParser(OpenXMLParser)