Import from old repository

This commit is contained in:
Stefan
2020-04-06 18:48:34 +02:00
commit 0da6783a45
762 changed files with 103065 additions and 0 deletions
+17
View File
@@ -0,0 +1,17 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+332
View File
@@ -0,0 +1,332 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains a foreman class for monitoring workers."""
import collections
import logging
from plaso.multi_processing import process_info
class Foreman(object):
  """A foreman class that monitors workers.

  The Foreman is responsible for monitoring worker processes
  and give back status information. The status information contains
  among other things:
  + Number of events extracted from each worker.
  + Path of the current file the worker is processing.
  + Indications whether the worker is alive or not.
  + Memory consumption of the worker.

  This information is gathered using both RPC calls to the worker
  itself as well as data provided by the psutil library.

  In the future the Foreman should be able to actively monitor
  the health of the processes and terminate and restart processes
  that are stuck.
  """

  # Describes one monitored worker: a human readable label string, the
  # process identifier (PID) and a process_info.ProcessInfo object used to
  # query liveness, status and memory usage.
  PROCESS_LABEL = collections.namedtuple('process_label', 'label pid process')

  def __init__(self, show_memory_usage=False):
    """Initialize the foreman process.

    Args:
      show_memory_usage: Optional boolean value to indicate memory information
                         should be included in logging. The default is false.
    """
    # Maps a worker PID to the most recently retrieved status dict.
    self._last_status_dict = {}
    # Process information for the foreman's own process.
    self._process_information = process_info.ProcessInfo()
    # List of PROCESS_LABEL tuples, one per monitored worker.
    self._process_labels = []
    # Set to True once SignalEndOfProcessing() is called; after that a dead
    # worker is treated as having exited normally.
    self._processing_done = False
    self._show_memory_usage = show_memory_usage

  @property
  def labels(self):
    """Return a list of all currently watched labels."""
    return self._process_labels

  @property
  def number_of_processes_in_watch_list(self):
    """Return the number of processes in the watch list."""
    return len(self._process_labels)

  def CheckStatus(self, label=None):
    """Checks status of either a single process or all from the watch list.

    Args:
      label: A process label (instance of PROCESS_LABEL), if not provided
             all processes from the watch list are checked. Defaults to None.
    """
    if label is not None:
      self._CheckStatus(label)
      return
    for process_label in self._process_labels:
      self._CheckStatus(process_label)

  def GetLabel(self, name=None, pid=None):
    """Return a label if found using either name or PID value.

    Args:
      name: String value that should match an already existing label.
      pid: A process ID (PID) value for a process that is monitored.

    Returns:
      A label (instance of PROCESS_LABEL) if found. If neither name
      nor pid value is given or the process does not exist a None value
      will be returned.
    """
    # Name takes precedence over pid when both are provided.
    if name is not None:
      for process_label in self._process_labels:
        if process_label.label == name:
          return process_label

    if pid is not None:
      for process_label in self._process_labels:
        if process_label.pid == pid:
          return process_label

  def MonitorWorker(self, label=None, pid=None, name=None):
    """Starts monitoring a worker by adding it to the monitor list.

    This function requires either a label to be set or a PID and a process
    name. If the label is empty or if both a PID and a name is not provided
    the function does nothing, as in no process is added to the list of
    workers to monitor (and no indication).

    Args:
      label: A process label (instance of PROCESS_LABEL), if not provided
             then a pid and a name is required. Defaults to None (if None
             then both a pid and name have to be provided).
      pid: The process ID (PID) of the worker that should be added to the
           monitor list. This is only required if label is not provided.
           Defaults to None. This is only used if label is set to None, in
           which case it has to be set.
      name: The name of the worker process, only required if label is not
            provided. Defaults to None, only used if label is set to None,
            in which case it has to be set.
    """
    if label is None:
      if pid is None or name is None:
        return
      label = self.PROCESS_LABEL(name, pid, process_info.ProcessInfo(pid=pid))

    if not label:
      return

    # Avoid duplicate entries for the same worker.
    if label not in self._process_labels:
      self._process_labels.append(label)

  def StopMonitoringWorker(self, label=None, pid=None, name=None):
    """Stop monitoring a particular worker and remove it from monitor list.

    The purpose of this function is to remove a worker from the list of
    monitored workers. In order to do that the function requires either a
    label or a pid and a name.

    Args:
      label: A process label (instance of PROCESS_LABEL). Defaults to None, and
             so then a pid and name are required.
      pid: The process ID (PID) of the worker that should no longer be
           monitored. This is only required if label is not provided and
           defaults to None.
      name: The name of the worker process, defaults to None and is only
            required if label is not set.
    """
    if label is None:
      if pid is None or name is None:
        return
      label = self.PROCESS_LABEL(
          name, pid, process_info.ProcessInfo(pid=pid))

    if label not in self._process_labels:
      return

    index = self._process_labels.index(label)
    del self._process_labels[index]
    logging.info(
        u'{0:s} [{1:d}] has been removed from foreman monitoring.'.format(
            label.label, label.pid))

  def SignalEndOfProcessing(self):
    """Indicate that processing is done.

    Returns:
      True, since this function may be called via RPC and the RPC caller
      expects a return value.
    """
    self._processing_done = True
    # TODO: Reconsider this as an info signal. Should this not be moved to
    # a debug one?
    logging.info(
        u'Foreman received a signal indicating that processing is completed.')

    # This function may be called via RPC functions that expects a value to be
    # returned.
    return True

  def TerminateProcess(self, label=None, pid=None, name=None):
    """Terminate a process, even if it is not in the watch list.

    Args:
      label: A process label (instance of PROCESS_LABEL), if not provided
             then a pid and a name is required. It defaults to None, in which
             case you need to provide a pid and/or a name.
      pid: The process ID (PID) of the worker. This is only required if label
           is not provided and defaults to None.
      name: The name of the worker process, only required if label is not
            provided and defaults to None.
    """
    if label is not None:
      self._TerminateProcess(label)
      return

    # Try to resolve the pid or name against the watch list first.
    if pid is not None:
      for process_label in self._process_labels:
        if process_label.pid == pid:
          self._TerminateProcess(process_label)
          return

    if name is not None:
      for process_label in self._process_labels:
        if process_label.label == name:
          self._TerminateProcess(process_label)
          return

    # If we reach here the process is not in our watch list.
    if pid is not None and name is not None:
      process_label = self.PROCESS_LABEL(
          name, pid, process_info.ProcessInfo(pid=pid))
      self._TerminateProcess(process_label)

  def _CheckStatus(self, label):
    """Check status for a single process from the watch list.

    This function will take a single label, which describes a worker process
    and check if it is alive, call the appropriate functions to log down
    information extracted from the worker and if a process is no longer alive
    and processing has been marked as done, it will remove the worker from
    the list of monitored workers. This function is also reponsible for killing
    or terminating a process that is alive and hanging, or not alive while
    it should be alive.

    In the future this function will also be responsible for restarting
    a worker, or signalling the engine that it needs to spin up a new worker
    in the case of a worker dying or being in an effective zombie state.

    Args:
      label: A process label (instance of PROCESS_LABEL).
    """
    if label not in self._process_labels:
      return

    process = label.process

    if process.IsAlive():
      status_dict = process.GetProcessStatus()
      # A missing status while processing is still ongoing indicates an RPC
      # problem rather than a finished worker.
      if not status_dict and not self._processing_done:
        logging.warning((
            u'Unable to connect to RPC socket to: {0:s} at '
            u'http://localhost:{1:d}').format(label.label, label.pid))

      if status_dict:
        self._last_status_dict[label.pid] = status_dict
        if status_dict.get('is_running', False):
          self._LogWorkerInformation(label, status_dict)
          if self._show_memory_usage:
            self._LogMemoryUsage(label)
          return

        else:
          logging.info(
              u'Process {0:s} [{1:d}] has complete it\'s processing. Total of '
              u'{2:d} events extracted'.format(
                  label.label, label.pid, status_dict.get('counter', 0)))
    else:
      logging.info(u'Process {0:s} [{1:d}] is not alive.'.format(
          label.label, label.pid))

    # Check if this process should be alive.
    if self._processing_done:
      # This process exited properly and should have. Let's remove it from our
      # list of labels.
      self.StopMonitoringWorker(label=label)
      return

    # We need to terminate the process.
    # TODO: Add a function to start a new instance of a worker instead of
    # just removing and killing it.
    logging.error(
        u'Process {0:s} [{1:d}] is not functioning when it should be. '
        u'Terminating it and removing from list.'.format(
            label.label, label.pid))
    self._TerminateProcess(label)

  def _LogMemoryUsage(self, label):
    """Logs memory information gathered from a process.

    This function will take a label and call the logging infrastructure to
    log information about the process's memory information.

    Args:
      label: A process label (instance of PROCESS_LABEL).
    """
    mem_info = label.process.GetMemoryInformation()
    logging.info((
        u'{0:s} - RSS: {1:d}, VMS: {2:d}, Shared: {3:d}, Text: {4:d}, lib: '
        u'{5:d}, data: {6:d}, dirty: {7:d}, Memory Percent: {8:0.2f}%').format(
            label.label, mem_info.rss, mem_info.vms, mem_info.shared,
            mem_info.text, mem_info.lib, mem_info.data, mem_info.dirty,
            mem_info.percent * 100))

  def _LogWorkerInformation(self, label, status=None):
    """Log information gathered from the worker.

    Args:
      label: A process label (instance of PROCESS_LABEL).
      status: Optional status dict, as returned by the worker's RPC status
              function, containing among others the keys 'counter',
              'current_file' and 'is_running'. Defaults to None, in which
              case nothing is logged.
    """
    if status:
      logging.info((
          u'{0:s} [{1:d}] - Events Extracted: {2:d} - File ({3:s}) - Running: '
          u'{4!s} <{5:s}>').format(
              label.label, label.pid, status.get('counter', -1),
              status.get('current_file', u''), status.get('is_running', False),
              unicode(label.process.status)))

  def _TerminateProcess(self, label):
    """Terminate a process given a process label.

    Attempts to terminate a process and if successful
    removes the label from the watch list.

    Args:
      label: A process label (instance of PROCESS_LABEL).
    """
    if label is None:
      return

    label.process.TerminateProcess()

    # Double check the process is dead.
    if label.process.IsAlive():
      logging.warning(u'Process {0:s} [{1:d}] is still alive.'.format(
          label.label, label.pid))
    elif label.process.status != 'exited':
      logging.warning(u'Process {0:s} [{1:d}] may still be alive.'.format(
          label.label, label.pid))
    else:
      logging.info(u'Process: {0:s} [{1:d}] has been terminated.'.format(
          label.label, label.pid))
      self.StopMonitoringWorker(label)
+700
View File
@@ -0,0 +1,700 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The multi-process processing engine."""
import ctypes
import logging
import multiprocessing
import os
import signal
import sys
import threading
from plaso.engine import collector
from plaso.engine import engine
from plaso.engine import queue
from plaso.engine import worker
from plaso.lib import errors
from plaso.multi_processing import foreman
from plaso.multi_processing import rpc_proxy
from plaso.parsers import context as parsers_context
def SigKill(pid):
  """Convenience function to issue a SIGKILL or equivalent.

  On Windows this uses TerminateProcess() via ctypes, elsewhere it sends
  SIGKILL via os.kill(). Failure to kill the process is logged, not raised.

  Args:
    pid: The process identifier.
  """
  if sys.platform.startswith('win'):
    process_terminate = 1
    handle = ctypes.windll.kernel32.OpenProcess(
        process_terminate, False, pid)
    # OpenProcess() returns NULL (0) on failure; do not call
    # TerminateProcess()/CloseHandle() on an invalid handle.
    if handle:
      ctypes.windll.kernel32.TerminateProcess(handle, -1)
      ctypes.windll.kernel32.CloseHandle(handle)
    else:
      logging.error(u'Unable to open process {0:d} to kill it.'.format(pid))

  else:
    try:
      os.kill(pid, signal.SIGKILL)
    except OSError as exception:
      # Note: use !s (str conversion) for the exception; a ':s' format spec
      # on a non-string object raises TypeError on Python 3.
      logging.error(
          u'Unable to kill process {0:d} with error: {1!s}'.format(
              pid, exception))
class MultiProcessEngine(engine.BaseEngine):
  """Class that defines the multi-process engine.

  The engine spins up a storage process, optionally a collection process and
  a foreman, plus a pool of extraction worker processes, wired together with
  multi-processing queues.
  """

  # Bounds applied when the number of workers is derived from the CPU count.
  _WORKER_PROCESSES_MINIMUM = 2
  _WORKER_PROCESSES_MAXIMUM = 15

  def __init__(self, maximum_number_of_queued_items=0):
    """Initialize the multi-process engine object.

    Args:
      maximum_number_of_queued_items: The maximum number of queued items.
                                      The default is 0, which represents
                                      no limit.
    """
    collection_queue = MultiProcessingQueue(
        maximum_number_of_queued_items=maximum_number_of_queued_items)
    storage_queue = MultiProcessingQueue(
        maximum_number_of_queued_items=maximum_number_of_queued_items)
    parse_error_queue = MultiProcessingQueue(
        maximum_number_of_queued_items=maximum_number_of_queued_items)

    super(MultiProcessEngine, self).__init__(
        collection_queue, storage_queue, parse_error_queue)

    self._collection_process = None
    self._foreman_object = None
    self._storage_process = None

    # TODO: turn into a process pool.
    # Maps worker name to its multiprocessing.Process object.
    self._worker_processes = {}

    # Attributes for RPC proxy server thread.
    self._proxy_thread = None
    self._rpc_proxy_server = None
    self._rpc_port_number = 0

  def _StartRPCProxyServerThread(self, foreman_object):
    """Starts the RPC proxy server thread.

    The RPC server exposes the foreman's SignalEndOfProcessing() so the
    collection process can signal completion back to the engine.

    Args:
      foreman_object: a foreman object (instance of Foreman).
    """
    if self._rpc_proxy_server or self._proxy_thread:
      return

    self._rpc_proxy_server = rpc_proxy.StandardRpcProxyServer(os.getpid())

    try:
      self._rpc_proxy_server.Open()
      self._rpc_proxy_server.RegisterFunction(
          'signal_end_of_collection', foreman_object.SignalEndOfProcessing)

      self._proxy_thread = threading.Thread(
          name='rpc_proxy', target=self._rpc_proxy_server.StartProxy)
      self._proxy_thread.start()

      self._rpc_port_number = self._rpc_proxy_server.listening_port

    except errors.ProxyFailedToStart as exception:
      logging.error((
          u'Unable to setup a RPC server for the engine with error '
          u'{0:s}').format(exception))

  def _StopRPCProxyServerThread(self):
    """Stops the RPC proxy server thread."""
    if not self._rpc_proxy_server or not self._proxy_thread:
      return

    # Close the proxy, free up resources so we can shut down the thread.
    self._rpc_proxy_server.Close()

    if self._proxy_thread.isAlive():
      self._proxy_thread.join()

    self._proxy_thread = None
    self._rpc_proxy_server = None
    self._rpc_port_number = 0

  def CreateCollector(
      self, include_directory_stat, vss_stores=None, filter_find_specs=None,
      resolver_context=None):
    """Creates a collector object.

    The collector discovers all the files that need to be processed by
    the workers. Once a file is discovered it is added to the process queue
    as a path specification (instance of dfvfs.PathSpec).

    Args:
      include_directory_stat: Boolean value to indicate whether directory
                              stat information should be collected.
      vss_stores: Optional list of VSS stores to include in the collection,
                  where 1 represents the first store. Set to None if no
                  VSS stores should be processed. The default is None.
      filter_find_specs: Optional list of filter find specifications (instances
                         of dfvfs.FindSpec). The default is None.
      resolver_context: Optional resolver context (instance of dfvfs.Context).
                        The default is None. Note that every thread or process
                        must have its own resolver context.

    Returns:
      A collector object (instance of Collector).

    Raises:
      RuntimeError: if source path specification is not set.
    """
    if not self._source_path_spec:
      raise RuntimeError(u'Missing source.')

    collector_object = collector.Collector(
        self._collection_queue, self._source, self._source_path_spec,
        resolver_context=resolver_context)

    collector_object.SetCollectDirectoryMetadata(include_directory_stat)

    if vss_stores:
      collector_object.SetVssInformation(vss_stores)

    if filter_find_specs:
      collector_object.SetFilter(filter_find_specs)

    return collector_object

  def CreateExtractionWorker(self, worker_number):
    """Creates an extraction worker object.

    Args:
      worker_number: A number that identifies the worker.

    Returns:
      An extraction worker (instance of worker.ExtractionWorker).
    """
    parser_context = parsers_context.ParserContext(
        self._event_queue_producer, self._parse_error_queue_producer,
        self.knowledge_base)

    extraction_worker = worker.BaseEventExtractionWorker(
        worker_number, self._collection_queue, self._event_queue_producer,
        self._parse_error_queue_producer, parser_context)

    extraction_worker.SetEnableDebugOutput(self._enable_debug_output)

    # TODO: move profiler in separate object.
    extraction_worker.SetEnableProfiling(
        self._enable_profiling,
        profiling_sample_rate=self._profiling_sample_rate)

    if self._open_files:
      extraction_worker.SetOpenFiles(self._open_files)

    if self._filter_object:
      extraction_worker.SetFilterObject(self._filter_object)

    if self._mount_path:
      extraction_worker.SetMountPath(self._mount_path)

    if self._text_prepend:
      extraction_worker.SetTextPrepend(self._text_prepend)

    return extraction_worker

  def ProcessSource(
      self, collector_object, storage_writer, parser_filter_string=None,
      number_of_extraction_workers=0, have_collection_process=True,
      have_foreman_process=True, show_memory_usage=False):
    """Processes the source and extracts event objects.

    Args:
      collector_object: A collector object (instance of Collector).
      storage_writer: A storage writer object (instance of BaseStorageWriter).
      parser_filter_string: Optional parser filter string. The default is None.
      number_of_extraction_workers: Optional number of extraction worker
                                    processes. The default is 0 which means
                                    the function will determine the suitable
                                    number.
      have_collection_process: Optional boolean value to indicate a separate
                               collection process should be run. The default
                               is true.
      have_foreman_process: Optional boolean value to indicate a separate
                            foreman process should be run to make sure the
                            workers are extracting event objects. The default
                            is true.
      show_memory_usage: Optional boolean value to indicate memory information
                         should be included in logging. The default is false.
    """
    if number_of_extraction_workers < 1:
      # One worker for each "available" CPU (minus other processes).
      # The number here is derived from the fact that the engine starts up:
      #   + A collector process (optional).
      #   + A storage process.
      #
      # If we want to utilize all CPUs on the system we therefore need to start
      # up workers that amounts to the total number of CPUs - the other
      # processes.
      cpu_count = multiprocessing.cpu_count() - 2

      if have_collection_process:
        cpu_count -= 1

      # Clamp to the supported worker count range.
      if cpu_count <= self._WORKER_PROCESSES_MINIMUM:
        cpu_count = self._WORKER_PROCESSES_MINIMUM
      elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM:
        cpu_count = self._WORKER_PROCESSES_MAXIMUM

      number_of_extraction_workers = cpu_count

    if have_foreman_process:
      self._foreman_object = foreman.Foreman(
          show_memory_usage=show_memory_usage)
      self._StartRPCProxyServerThread(self._foreman_object)

    self._storage_process = MultiProcessStorageProcess(
        storage_writer, name='StorageProcess')
    self._storage_process.start()

    if have_collection_process:
      self._collection_process = MultiProcessCollectionProcess(
          collector_object, self._rpc_port_number, name='CollectionProcess')
      self._collection_process.start()

    logging.info(u'Starting extraction worker processes.')
    for worker_number in range(number_of_extraction_workers):
      extraction_worker = self.CreateExtractionWorker(worker_number)

      worker_name = u'Worker_{0:d}'.format(worker_number)

      # TODO: Test to see if a process pool can be a better choice.
      worker_process = MultiProcessEventExtractionWorkerProcess(
          extraction_worker, parser_filter_string, name=worker_name)
      worker_process.start()

      if self._foreman_object:
        self._foreman_object.MonitorWorker(
            pid=worker_process.pid, name=worker_name)

      self._worker_processes[worker_name] = worker_process

    logging.debug(u'Collection started.')
    if not self._collection_process:
      # Without a dedicated collection process collect in this process.
      collector_object.Collect()

    else:
      while self._collection_process.is_alive():
        self._collection_process.join(timeout=10)

        # Check the worker status regularly while collection is still ongoing.
        if self._foreman_object:
          self._foreman_object.CheckStatus()

          # TODO: We get a signal when collection is done, which might happen
          # before the collection thread joins. Look at the option of speeding
          # up the process of the collector stopping by potentially killing it.

    logging.info(u'Collection stopped.')

    self._StopProcessing()

  def _StopProcessing(self):
    """Stops the foreman and worker processes."""
    if self._foreman_object:
      self._foreman_object.SignalEndOfProcessing()
      self._StopRPCProxyServerThread()

    # Run through the running workers, one by one.
    # This will go through a list of all active worker processes and check it's
    # status. If a worker has completed it will be removed from the list.
    # The process will not wait longer than five seconds for each worker to
    # complete, if longer time passes it will simply check it's status and
    # move on. That ensures that worker process is monitored and status is
    # updated.
    while self._worker_processes:
      # Note that self._worker_processes is altered in this loop hence we need
      # it to be sorted.
      for process_name, process_obj in sorted(self._worker_processes.items()):
        if self._foreman_object:
          worker_label = self._foreman_object.GetLabel(
              name=process_name, pid=process_obj.pid)
        else:
          worker_label = None

        if not worker_label:
          if process_obj.is_alive():
            logging.info((
                u'Process {0:s} [{1:d}] is not monitored by the foreman. Most '
                u'likely due to a worker having completed it\'s processing '
                u'while waiting for another worker to complete.').format(
                    process_name, process_obj.pid))

            logging.info(
                u'Waiting for worker {0:s} to complete.'.format(process_name))
            process_obj.join()
            logging.info(u'Worker: {0:s} [{1:d}] has completed.'.format(
                process_name, process_obj.pid))

          del self._worker_processes[process_name]
          continue

        if process_obj.is_alive():
          # Check status of worker.
          self._foreman_object.CheckStatus(label=worker_label)
          process_obj.join(timeout=5)

        # Note that we explicitly must test against exitcode 0 here since
        # process.exitcode will be None if there is no exitcode.
        elif process_obj.exitcode != 0:
          logging.warning((
              u'Worker process: {0:s} already exited with code: '
              u'{1:d}.').format(process_name, process_obj.exitcode))
          process_obj.terminate()
          self._foreman_object.TerminateProcess(label=worker_label)

        else:
          # Process is no longer alive, no need to monitor.
          self._foreman_object.StopMonitoringWorker(label=worker_label)
          # Remove it from our list of active workers.
          del self._worker_processes[process_name]

    if self._foreman_object:
      self._foreman_object = None

    logging.info(u'Extraction workers stopped.')
    self._event_queue_producer.SignalEndOfInput()
    self._storage_process.join()
    logging.info(u'Storage writer stopped.')

  def _AbortNormal(self, timeout=None):
    """Abort in a normal way.

    Signals all child processes to abort, then waits (with an optional
    timeout) for each of them.

    Args:
      timeout: The process join timeout. The default is None meaning
               no timeout.
    """
    if self._collection_process:
      logging.warning(u'Signaling collection process to abort.')
      self._collection_process.SignalAbort()

    if self._worker_processes:
      logging.warning(u'Signaling worker processes to abort.')
      for _, worker_process in self._worker_processes.iteritems():
        worker_process.SignalAbort()

    logging.warning(u'Signaling storage process to abort.')
    self._event_queue_producer.SignalEndOfInput()
    self._storage_process.SignalAbort()

    if self._collection_process:
      logging.warning(u'Waiting for collection process: {0:d}.'.format(
          self._collection_process.pid))
      # TODO: it looks like xmlrpclib.ServerProxy is not allowing the
      # collection process to close.
      self._collection_process.join(timeout=timeout)

    if self._worker_processes:
      for worker_name, worker_process in self._worker_processes.iteritems():
        logging.warning(u'Waiting for worker: {0:s} process: {1:d}'.format(
            worker_name, worker_process.pid))
        worker_process.join(timeout=timeout)

    if self._storage_process:
      # Fix: previously this logged self._collection_process.pid, which is
      # the wrong process and raises AttributeError when no collection
      # process was started.
      logging.warning(u'Waiting for storage process: {0:d}.'.format(
          self._storage_process.pid))
      self._storage_process.join(timeout=timeout)

  def _AbortTerminate(self):
    """Abort processing by sending SIGTERM or equivalent."""
    if self._collection_process and self._collection_process.is_alive():
      logging.warning(u'Terminating collection process: {0:d}.'.format(
          self._collection_process.pid))
      self._collection_process.terminate()

    if self._worker_processes:
      for worker_name, worker_process in self._worker_processes.iteritems():
        if worker_process.is_alive():
          logging.warning(u'Terminating worker: {0:s} process: {1:d}'.format(
              worker_name, worker_process.pid))
          worker_process.terminate()

    if self._storage_process and self._storage_process.is_alive():
      logging.warning(u'Terminating storage process: {0:d}.'.format(
          self._storage_process.pid))
      self._storage_process.terminate()

  def _AbortKill(self):
    """Abort processing by sending SIGKILL or equivalent."""
    if self._collection_process and self._collection_process.is_alive():
      logging.warning(u'Killing collection process: {0:d}.'.format(
          self._collection_process.pid))
      SigKill(self._collection_process.pid)

    if self._worker_processes:
      for worker_name, worker_process in self._worker_processes.iteritems():
        if worker_process.is_alive():
          logging.warning(u'Killing worker: {0:s} process: {1:d}'.format(
              worker_name, worker_process.pid))
          SigKill(worker_process.pid)

    if self._storage_process and self._storage_process.is_alive():
      logging.warning(u'Killing storage process: {0:d}.'.format(
          self._storage_process.pid))
      SigKill(self._storage_process.pid)

  def SignalAbort(self):
    """Signals the engine to abort.

    Escalates: normal abort with a short timeout, then SIGTERM, and on a
    further KeyboardInterrupt SIGKILL of all children and this process.
    """
    super(MultiProcessEngine, self).SignalAbort()

    try:
      self._AbortNormal(timeout=2)
      self._AbortTerminate()
    except KeyboardInterrupt:
      self._AbortKill()

    # TODO: remove the need for this.
    # Sometimes the main process will be unresponsive.
    SigKill(os.getpid())
class MultiProcessCollectionProcess(multiprocessing.Process):
  """Class that defines a multi-processing collection process."""

  def __init__(self, collector_object, rpc_port_number, **kwargs):
    """Initializes the process object.

    Args:
      collector_object: A collector object (instance of Collector).
      rpc_port_number: An integer value containing the RPC end point port
                      number or 0 if not set.
    """
    super(MultiProcessCollectionProcess, self).__init__(**kwargs)
    self._collector_object = collector_object
    # Port of the engine's RPC proxy server, used to signal the engine when
    # collection has finished; 0 means no signaling is attempted.
    self._rpc_port_number = rpc_port_number

  # This method part of the multiprocessing.Process interface hence its name
  # is not following the style guide.
  def run(self):
    """The main loop."""
    # Prevent the KeyboardInterrupt being raised inside the worker process.
    # This will prevent a collection process to generate a traceback
    # when interrupted.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    logging.debug(u'Collection process: {0!s} started'.format(self._name))

    rpc_proxy_client = None
    if self._rpc_port_number:
      try:
        rpc_proxy_client = rpc_proxy.StandardRpcProxyClient(
            self._rpc_port_number)
        rpc_proxy_client.Open()
      except errors.ProxyFailedToStart as exception:
        # Collection still proceeds; only the end-of-collection signal is
        # lost when the RPC client cannot be set up.
        logging.error((
            u'Unable to setup a RPC client for the collector process with '
            u'error {0:s}').format(exception))

    self._collector_object.Collect()

    logging.debug(u'Collection process: {0!s} stopped'.format(self._name))
    if rpc_proxy_client:
      # Notify the engine (via the foreman) that collection is complete.
      _ = rpc_proxy_client.GetData(u'signal_end_of_collection')

  def SignalAbort(self):
    """Signals the process to abort."""
    self._collector_object.SignalAbort()
class MultiProcessEventExtractionWorkerProcess(multiprocessing.Process):
  """Class that defines a multi-processing event extraction worker process."""

  def __init__(self, extraction_worker, parser_filter_string, **kwargs):
    """Initializes the process object.

    Args:
      extraction_worker: The extraction worker object (instance of
                         MultiProcessEventExtractionWorker).
      parser_filter_string: Optional parser filter string. The default is None.
    """
    super(MultiProcessEventExtractionWorkerProcess, self).__init__(**kwargs)
    self._extraction_worker = extraction_worker

    # TODO: clean this up with the implementation of a task based
    # multi-processing approach.
    self._parser_filter_string = parser_filter_string

    # Attributes for RPC proxy server thread.
    self._proxy_thread = None
    self._rpc_proxy_server = None

  def _StartRPCProxyServerThread(self):
    """Starts the RPC proxy server thread."""
    if self._rpc_proxy_server or self._proxy_thread:
      return

    # Set up a simple XML RPC server for the worker for status indications.
    # Since we don't know the worker's PID for now we'll set the initial port
    # number to zero and then adjust it later.
    self._rpc_proxy_server = rpc_proxy.StandardRpcProxyServer()

    try:
      self._rpc_proxy_server.SetListeningPort(os.getpid())
      self._rpc_proxy_server.Open()
      self._rpc_proxy_server.RegisterFunction(
          'status', self._extraction_worker.GetStatus)

      self._proxy_thread = threading.Thread(
          name='rpc_proxy', target=self._rpc_proxy_server.StartProxy)
      self._proxy_thread.start()

    except errors.ProxyFailedToStart as exception:
      # Fix: the original referenced self._identifier, which is never
      # defined (multiprocessing.Process provides _name), so reporting this
      # error would itself raise AttributeError.
      logging.error((
          u'Unable to setup a RPC server for the worker: {0:s} [PID {1:d}] '
          u'with error: {2!s}').format(
              self._name, os.getpid(), exception))

  def _StopRPCProxyServerThread(self):
    """Stops the RPC proxy server thread."""
    if not self._rpc_proxy_server or not self._proxy_thread:
      return

    # Close the proxy, free up resources so we can shut down the thread.
    self._rpc_proxy_server.Close()

    if self._proxy_thread.isAlive():
      self._proxy_thread.join()

    self._rpc_proxy_server = None
    self._proxy_thread = None

  # This method part of the multiprocessing.Process interface hence its name
  # is not following the style guide.
  def run(self):
    """The main loop."""
    # Prevent the KeyboardInterrupt being raised inside the worker process.
    # This will prevent a worker process to generate a traceback
    # when interrupted.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # We need to initialize the parser object after the process
    # has forked otherwise on Windows the "fork" will fail with
    # a PickleError for Python modules that cannot be pickled.
    self._extraction_worker.InitalizeParserObjects(
        parser_filter_string=self._parser_filter_string)

    logging.debug(u'Worker process: {0!s} started'.format(self._name))
    self._StartRPCProxyServerThread()

    self._extraction_worker.Run()

    logging.debug(u'Worker process: {0!s} stopped'.format(self._name))
    self._StopRPCProxyServerThread()

  def SignalAbort(self):
    """Signals the process to abort."""
    self._extraction_worker.SignalAbort()
class MultiProcessStorageProcess(multiprocessing.Process):
  """A multi-processing child process that runs the storage writer."""

  def __init__(self, storage_writer, **kwargs):
    """Initializes the storage process object.

    Args:
      storage_writer: A storage writer object (instance of BaseStorageWriter).
    """
    super(MultiProcessStorageProcess, self).__init__(**kwargs)
    self._storage_writer = storage_writer

  # Named per the multiprocessing.Process interface, hence the deviation
  # from the style guide.
  def run(self):
    """The main loop: write event objects until the queue is exhausted."""
    # Ignore SIGINT so that an interrupt in the parent process does not
    # produce a traceback in this child process.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    logging.debug(u'Storage process: {0!s} started'.format(self._name))
    self._storage_writer.WriteEventObjects()
    logging.debug(u'Storage process: {0!s} stopped'.format(self._name))

  def SignalAbort(self):
    """Signals the process to abort."""
    self._storage_writer.SignalAbort()
class MultiProcessingQueue(queue.Queue):
  """Class that defines the multi-processing queue.

  Wraps multiprocessing.Queue behind the project's queue interface and caps
  the requested maximum size at the platform's semaphore limit.
  """

  def __init__(self, maximum_number_of_queued_items=0):
    """Initializes the multi-processing queue object.

    Args:
      maximum_number_of_queued_items: The maximum number of queued items.
                                      The default is 0, which represents
                                      no limit.
    """
    super(MultiProcessingQueue, self).__init__()

    # maxsize contains the maximum number of items allowed to be queued,
    # where 0 represents unlimited.
    # We need to check that we aren't asking for a bigger queue than the
    # platform supports, which requires access to this protected member.
    # pylint: disable=protected-access
    queue_max_length = multiprocessing._multiprocessing.SemLock.SEM_VALUE_MAX
    # pylint: enable=protected-access
    if maximum_number_of_queued_items > queue_max_length:
      # logging.warn() is a deprecated alias of logging.warning().
      logging.warning(
          u'Maximum queue size requested ({0:d}) is larger than system '
          u'supported maximum size. Setting queue size to maximum supported '
          u'size, '
          u'({1:d})'.format(maximum_number_of_queued_items, queue_max_length))
      maximum_number_of_queued_items = queue_max_length

    self._queue = multiprocessing.Queue(
        maxsize=maximum_number_of_queued_items)

  def __len__(self):
    """Returns the estimated current number of items in the queue.

    Raises:
      NotImplementedError: on platforms where multiprocessing.Queue.qsize()
                           is not implemented, e.g. Mac OS X because of a
                           broken sem_getvalue().
    """
    size = 0
    try:
      size = self._queue.qsize()
    except NotImplementedError:
      logging.warning((
          u'Returning queue length does not work on Mac OS X because of broken '
          u'sem_getvalue()'))
      raise

    return size

  def IsEmpty(self):
    """Determines if the queue is empty."""
    return self._queue.empty()

  def PushItem(self, item):
    """Pushes an item onto the queue."""
    self._queue.put(item)

  def PopItem(self):
    """Pops an item off the queue.

    Raises:
      errors.QueueEmpty: when the blocking get() is interrupted by a
                         KeyboardInterrupt, so consumers can shut down
                         cleanly.
    """
    try:
      return self._queue.get()
    except KeyboardInterrupt:
      raise errors.QueueEmpty
@@ -0,0 +1,52 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the multi-process processing engine."""
import unittest
from plaso.engine import test_lib
from plaso.multi_processing import multi_process
class MultiProcessingQueueTest(unittest.TestCase):
  """Tests the multi-processing queue."""

  _ITEMS = frozenset(['item1', 'item2', 'item3', 'item4'])

  def testPushPopItem(self):
    """Tests the PushItem and PopItem functions."""
    test_queue = multi_process.MultiProcessingQueue()

    for item in self._ITEMS:
      test_queue.PushItem(item)

    try:
      # assertEquals() is a deprecated alias of assertEqual().
      self.assertEqual(len(test_queue), len(self._ITEMS))
    except NotImplementedError:
      # On Mac OS X because of broken sem_getvalue()
      return

    test_queue.SignalEndOfInput()
    test_queue_consumer = test_lib.TestQueueConsumer(test_queue)
    test_queue_consumer.ConsumeItems()

    self.assertEqual(test_queue_consumer.number_of_items, len(self._ITEMS))
# Standard unittest entry point when the module is run as a script.
if __name__ == '__main__':
  unittest.main()
+259
View File
@@ -0,0 +1,259 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains a class to get process information."""
import collections
import os
import SocketServer
import psutil
from plaso.lib import timelib
from plaso.multi_processing import rpc_proxy
class ProcessInfo(object):
  """Class that provides information about a running process.

  Wraps a psutil.Process object and handles the API differences between
  psutil versions before and after 2.0.0. Also holds an RPC client used to
  query the monitored process for status information.
  """

  # Named tuple returned by GetMemoryInformation(); attributes that are not
  # available on a platform are reported as 0.
  _MEMORY_INFORMATION = collections.namedtuple(
      'memory_information', 'rss vms shared text lib data dirty percent')

  def __init__(self, pid=None):
    """Initialize the process information object.

    Args:
      pid: Process ID (PID) value of the process to monitor. The default value
           is None in which case the PID of the calling
           process will be used.

    Raises:
      IOError: If the pid does not exist.
    """
    if pid is None:
      self._pid = os.getpid()
    else:
      self._pid = pid

    if not psutil.pid_exists(self._pid):
      raise IOError(u'Unable to read data from pid: {0:d}'.format(self._pid))

    self._command_line = ''
    self._parent = None
    self._process = psutil.Process(self._pid)

    # psutil 2.0.0 changed several process attributes (name, cmdline, parent,
    # status, create_time) from properties into methods; remember which style
    # of API this psutil version exposes.
    if getattr(psutil, 'version_info', (0, 0, 0)) < (2, 0, 0):
      self._psutil_pre_v2 = True
    else:
      self._psutil_pre_v2 = False

    # TODO: Allow the client proxy object to determined at run time and not
    # a fixed value as here.
    # The RPC port is derived from the monitored PID; the client is used by
    # GetProcessStatus() to query the process for status information.
    self._rpc_client = rpc_proxy.StandardRpcProxyClient(self._pid)
    self._rpc_client.Open()

  @property
  def pid(self):
    """Return the process ID (PID)."""
    return self._pid

  @property
  def name(self):
    """Return the name of the process."""
    if self._psutil_pre_v2:
      return self._process.name
    return self._process.name()

  @property
  def command_line(self):
    """Return the full command line used to start the process.

    Returns None when the process no longer exists and no command line was
    cached by an earlier call.
    """
    # The command line is cached after the first successful lookup.
    if self._command_line:
      return self._command_line

    try:
      if self._psutil_pre_v2:
        command_lines = self._process.cmdline
      else:
        command_lines = self._process.cmdline()

      self._command_line = u' '.join(command_lines)
    except psutil.NoSuchProcess:
      return

    return self._command_line

  @property
  def parent(self):
    """Return a ProcessInfo object for the parent process.

    Returns None when the process no longer exists.
    """
    # The parent object is cached after the first successful lookup.
    if self._parent is not None:
      return self._parent

    try:
      if self._psutil_pre_v2:
        parent_pid = self._process.parent.pid
      else:
        parent = self._process.parent() # pylint: disable-msg=not-callable
        parent_pid = parent.pid

      self._parent = ProcessInfo(pid=parent_pid)
      return self._parent
    except psutil.NoSuchProcess:
      return

  @property
  def open_files(self):
    """Yield a list of open files the process has."""
    # NOTE(review): get_open_files() is the pre-psutil-2.0 method name and,
    # unlike other accessors here, is not guarded by _psutil_pre_v2 — verify
    # against the supported psutil versions.
    try:
      for open_file in self._process.get_open_files():
        yield open_file.path
    except (psutil.AccessDenied, psutil.NoSuchProcess):
      return

  @property
  def children(self):
    """Yield all child processes as a ProcessInfo object."""
    try:
      for child in self._process.get_children():
        yield ProcessInfo(pid=child.pid)
    except psutil.NoSuchProcess:
      # We are creating an empty generator here. Yield or return None
      # individually don't provide that behavior, neither does raising
      # GeneratorExit or StopIteration.
      # pylint: disable=unreachable
      return
      yield

  @property
  def number_of_threads(self):
    """Return back the number of threads this process has.

    Returns 0 when the process no longer exists.
    """
    try:
      return self._process.get_num_threads()
    except psutil.NoSuchProcess:
      return 0

  @property
  def memory_map(self):
    """Yield memory map objects (instance of mmap)."""
    try:
      for memory_map in self._process.get_memory_maps():
        yield memory_map
    except psutil.NoSuchProcess:
      # We are creating an empty generator here. Yield or return None
      # individually don't provide that behavior, neither does raising
      # GeneratorExit or StopIteration.
      # pylint: disable=unreachable
      return
      yield

  @property
  def status(self):
    """Return the process status, or the string 'exited' when it is gone."""
    try:
      if self._psutil_pre_v2:
        return self._process.status
      else:
        return self._process.status()
    except psutil.NoSuchProcess:
      return u'exited'

  @property
  def start_time(self):
    """Return back the start time of the process.

    Returns:
      An integer representing the number of microseconds since Unix Epoch time
      in UTC.
    """
    if self._psutil_pre_v2:
      create_time = self._process.create_time
    else:
      create_time = self._process.create_time()
    # create_time is in seconds since the Epoch; convert to the project's
    # timestamp representation.
    return timelib.Timestamp.FromPosixTime(int(create_time))

  @property
  def io_counters(self):
    """Return back IO Counters for the process, or None when it is gone."""
    try:
      return self._process.get_io_counters()
    except psutil.NoSuchProcess:
      return

  @property
  def cpu_times(self):
    """Return back CPU times for the process, or None when it is gone."""
    try:
      return self._process.get_cpu_times()
    except psutil.NoSuchProcess:
      return

  @property
  def cpu_percent(self):
    """Return back the percent of CPU processing this process consumes."""
    try:
      return self._process.get_cpu_percent()
    except psutil.NoSuchProcess:
      return

  def GetMemoryInformation(self):
    """Return back memory information as a memory_information object.

    Returns:
      Memory information object (instance of memory_information) a named
      tuple that contains the following attributes: rss, vms, shared, text,
      lib, data, dirty, percent. Returns None when the process is gone.
    """
    try:
      external_information = self._process.get_ext_memory_info()
    except psutil.NoSuchProcess:
      return

    percent = self._process.get_memory_percent()

    # Psutil will return different memory information depending on what is
    # available in that platform.
    # TODO: Not be as strict in what gets returned, have this object more
    # flexible so that the memory information returned reflects the available
    # information in the platform.
    return self._MEMORY_INFORMATION(
        getattr(external_information, 'rss', 0),
        getattr(external_information, 'vms', 0),
        getattr(external_information, 'shared', 0),
        getattr(external_information, 'text', 0),
        getattr(external_information, 'lib', 0),
        getattr(external_information, 'data', 0),
        getattr(external_information, 'dirty', 0), percent)

  def GetProcessStatus(self):
    """Attempt to connect to process via RPC to gather status information.

    Returns:
      A dict with the status information, or None when the RPC client is not
      set up, the call fails, or the reply is not a dict.
    """
    if self._rpc_client is None:
      return
    try:
      status = self._rpc_client.GetData('status')
      if isinstance(status, dict):
        return status
    except SocketServer.socket.error:
      return

  def IsAlive(self):
    """Return a boolean value indicating if the process is alive or not."""
    return self._process.is_running()

  def TerminateProcess(self):
    """Terminate the process."""
    # TODO: Make sure the process has really been terminated.
    if self.IsAlive():
      self._process.terminate()
+134
View File
@@ -0,0 +1,134 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Simple RPC proxy server and client."""
import logging
import SimpleXMLRPCServer
import SocketServer
import xmlrpclib
from xml.parsers import expat
from plaso.lib import errors
from plaso.lib import proxy
class StandardRpcProxyServer(proxy.ProxyServer):
  """Class that implements a simple XML RPC based proxy server.

  Lifecycle: Open() creates the XML RPC server, RegisterFunction() exposes
  callbacks, StartProxy() serves (blocking) and Close() shuts it down.
  """

  def __init__(self, port=0):
    """Initializes the RPC proxy server object.

    Args:
      port: The port number the proxy should listen on. Defaults to 0.
    """
    # The actual listening port is derived from the given value (typically a
    # PID) by GetProxyPortNumberFromPID().
    super(StandardRpcProxyServer, self).__init__(
        proxy.GetProxyPortNumberFromPID(port))
    self._proxy = None

  def Close(self):
    """Close the proxy object."""
    if not self._proxy:
      return
    # shutdown() stops the serve_forever() loop started by StartProxy().
    self._proxy.shutdown()
    self._proxy = None

  def Open(self):
    """Set up the proxy so that it can be started.

    Raises:
      errors.ProxyFailedToStart: when the XML RPC server cannot be set up
                                 on the listening port.
    """
    try:
      self._proxy = SimpleXMLRPCServer.SimpleXMLRPCServer(
          ('localhost', self.listening_port), logRequests=False,
          allow_none=True)
    except SocketServer.socket.error as exception:
      raise errors.ProxyFailedToStart(
          u'Unable to setup a RPC server for listening to port: {0:d} with '
          u'error: {1:s}'.format(self.listening_port, exception))

  def SetListeningPort(self, new_port_number):
    """Change the port number the proxy listens to."""
    # We don't want to change the port after the proxy has been started.
    if self._proxy:
      logging.warning(
          u'Unable to change proxy ports for an already started proxy.')
      return

    self._port_number = proxy.GetProxyPortNumberFromPID(new_port_number)

  def StartProxy(self):
    """Start the proxy.

    Blocks until Close() invokes shutdown() on the server.

    Raises:
      errors.ProxyFailedToStart: when Open() has not been called first.
    """
    if not self._proxy:
      raise errors.ProxyFailedToStart(u'Proxy not set up yet.')
    self._proxy.serve_forever()

  def RegisterFunction(self, function_name, function):
    """Register a function to this RPC proxy.

    Args:
      function_name: The name of the proxy function.
      function: Callback method to the function providing the requested
                information.

    Raises:
      errors.ProxyFailedToStart: when Open() has not been called first.
    """
    if not self._proxy:
      raise errors.ProxyFailedToStart((
          u'Unable to register a function for a proxy that has not been set '
          u'up yet.'))
    self._proxy.register_function(function, function_name)
class StandardRpcProxyClient(proxy.ProxyClient):
  """Class that implements a simple XML RPC based proxy client.

  Connects to a StandardRpcProxyServer on localhost and retrieves data by
  invoking named callback methods over XML RPC.
  """

  def __init__(self, port=0):
    """Initializes the RPC proxy client object.

    Args:
      port: The port number the proxy should connect to. Defaults to 0.
    """
    super(StandardRpcProxyClient, self).__init__(
        proxy.GetProxyPortNumberFromPID(port))
    self._proxy = None

  def Open(self):
    """Set up the proxy so that it can be started.

    On failure the proxy is left unset so later calls become no-ops.
    """
    server_url = u'http://localhost:{0:d}'.format(self._port_number)
    try:
      self._proxy = xmlrpclib.ServerProxy(server_url, allow_none=True)
    except SocketServer.socket.error:
      self._proxy = None

  def GetData(self, call_back_name):
    """Return back data from the RPC proxy using a callback method.

    Args:
      call_back_name: The name of the callback method that the RPC proxy
                      supports.

    Returns:
      The data returned back by the callback method, or None when the proxy
      is not open, the callback does not exist, or the call fails.
    """
    if self._proxy is None:
      return

    callback_method = getattr(self._proxy, call_back_name, None)
    if callback_method is not None:
      try:
        return callback_method()
      except (SocketServer.socket.error, expat.ExpatError):
        return