plaso-rubanetra/plaso/lib/utils.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains utility functions."""

import logging

from plaso.lib import errors
from plaso.lib import lexer


# Immutable set of reserved variable names.
# NOTE(review): presumably these are attribute names that carry a special
# meaning elsewhere in the project - confirm against the users of this set.
RESERVED_VARIABLES = frozenset((
    'username',
    'inode',
    'hostname',
    'body',
    'parser',
    'regvalue',
    'timestamp',
    'timestamp_desc',
    'source_short',
    'source_long',
    'timezone',
    'filename',
    'display_name',
    'pathspec',
    'offset',
    'store_number',
    'store_index',
    'tag',
    'data_type',
    'metadata',
    'http_headers',
    'query',
    'mapped_files',
    'uuid',
))


def IsText(bytes_in, encoding=None):
  """Examine the bytes in and determine if they are indicative of a text.

  Parsers need quick and at least semi reliable method of discovering whether
  or not a particular byte stream is a text or resembles text or not. This can
  be used in text parsers to determine if a file is a text file or not for
  instance.

  A value that already is a Unicode string is always considered text. A byte
  sequence is considered text when it can be decoded as UTF-8 (which includes
  plain ASCII) or as the optional caller supplied encoding. UTF-16 is
  currently not tried, see the TODO in the code.

  Args:
    bytes_in: The byte sequence passed to the method that needs examination.
    encoding: Optional encoding to test, if not defined only UTF-8 (and with
              that ASCII) is tried.

  Returns:
    Boolean value indicating whether or not the byte sequence is a text or not.
  """
  # TODO: Improve accuracy of this method.

  # Is this already a Unicode text? type(u'') is unicode on Python 2 and str
  # on Python 3. isinstance is used so that subclasses are recognized as well
  # instead of falling through to the byte string decoding below.
  if isinstance(bytes_in, type(u'')):
    return True

  # Check if this is UTF-8. Printable ASCII is a strict subset of UTF-8, hence
  # no separate character by character ASCII check is needed: every sequence
  # that such a check would accept is also accepted here.
  try:
    bytes_in.decode('utf-8')
    return True
  except UnicodeDecodeError:
    pass

  # TODO: UTF-16 decode is successful in too many edge cases where we are not
  # really dealing with a text at all. Decoding as 'utf-16-le' is therefore
  # left out for now, consider re-enabling or making a better determination.

  if encoding:
    try:
      bytes_in.decode(encoding)
      return True
    except UnicodeDecodeError:
      pass
    except LookupError:
      # The encoding name is unknown to the codec registry; report it and
      # treat the data as not being text.
      logging.error(
          u'String encoding not recognized: {0:s}'.format(encoding))

  return False


def GetBaseName(path):
  """Returns back a basename for a path (could be Windows or *NIX separated).

  When the path contains both forward and backward slashes the separator that
  occurs most often is assumed to be the path separator; on a tie the
  backward slash is used.

  Args:
    path: The path string.

  Returns:
    The part of the path after the last separator, or the whole path if it
    does not contain a separator.
  """
  # Comparing the number of occurrences covers every case at once: a path
  # with only forward slashes picks '/', a path with only backward slashes or
  # without any separator picks '\\' and a path with both picks the most
  # frequent one.
  if path.count('/') > path.count('\\'):
    separator = '/'
  else:
    separator = '\\'

  _, _, base = path.rpartition(separator)
  return base


def GetUnicodeString(string):
  """Converts the string to Unicode if necessary.

  Args:
    string: The value to convert. A Unicode string is returned as is, a byte
            string is decoded as UTF-8 and any other object is converted to
            its string representation first.

  Returns:
    A Unicode string. Bytes that are not valid UTF-8 are dropped.
  """
  # type(u'') is unicode on Python 2 and str on Python 3.
  text_type = type(u'')

  # isinstance also accepts subclasses of the text type; passing those
  # through str() could raise on non-ASCII characters on Python 2.
  if isinstance(string, text_type):
    return string

  if not isinstance(string, (bytes, bytearray)):
    string = str(string)
    # On Python 3 str() already yields text, on Python 2 it yields a byte
    # string that still needs to be decoded.
    if isinstance(string, text_type):
      return string

  return string.decode('utf8', 'ignore')


class PathReplacer(lexer.Lexer):
  """Lexer that expands path variables with values from preprocessing.

  The token callbacks build up the resulting path piece by piece:
    * {{name}} is emitted as the literal text {name};
    * {name} is replaced by the attribute "name" of the preprocessing object;
    * any other single character that is not an opening curly brace is copied
      as is.
  """

  tokens = [
      lexer.Token('.', '{{([^}]+)}}', 'ReplaceVariable', ''),
      lexer.Token('.', '{([^}]+)}', 'ReplaceString', ''),
      lexer.Token('.', '([^{])', 'ParseString', ''),
  ]

  def __init__(self, pre_obj, data=''):
    """Initializes the path replacer.

    Args:
      pre_obj: The object whose attributes provide the replacement values.
      data: Optional string containing the path to process, passed on to the
            lexer.
    """
    super(PathReplacer, self).__init__(data)
    self._pre_obj = pre_obj
    # The pieces of the resulting path, in the order they were produced.
    self._path = []

  def GetPath(self):
    """Runs the lexer over the data and returns the resulting path.

    Returns:
      A Unicode string of the path pieces joined together.
    """
    # Always consume at least one token, then continue until the lexer
    # reports it is empty.
    self.NextToken()
    while not self.Empty():
      self.NextToken()

    return u''.join(self._path)

  def ParseString(self, match, **_):
    """Appends the matched character to the path unchanged."""
    character = match.group(1)
    self._path.append(character)

  def ReplaceVariable(self, match, **_):
    """Appends the matched name wrapped in a single pair of curly braces."""
    name = match.group(1)
    self._path.append(u'{{{0:s}}}'.format(name))

  def ReplaceString(self, match, **_):
    """Appends the value of the preprocessing attribute named in the match.

    Raises:
      PathNotFound: if the attribute is missing or has a value that
                    evaluates to False.
    """
    name = match.group(1)
    value = getattr(self._pre_obj, name, None)

    if not value:
      raise errors.PathNotFound(
          u'Path variable: {} not discovered yet.'.format(name))

    self._path.append(value)

def GetInodeValue(inode_raw):
  """Read in a 'raw' inode value and try to convert it into an integer.

  The value is first converted directly. If that fails the part of its string
  representation before the first dash is tried, so that a value such as
  '123-45' yields 123.

  Args:
    inode_raw: A string or an int inode value.

  Returns:
    An integer inode value or -1 if the value cannot be converted.
  """
  try:
    # Covers integers, floats (truncated) and strings holding a number.
    return int(inode_raw)
  except (TypeError, ValueError, OverflowError):
    # TypeError: unsupported types such as None.
    # ValueError: non-numeric strings and float NaN.
    # OverflowError: float infinity.
    pass

  # Let's do one more attempt with the part before the first dash. The
  # conversion to string is part of the guarded block since it can raise a
  # UnicodeEncodeError, a subclass of ValueError, for non-ASCII text on
  # Python 2.
  try:
    inode_string, _, _ = str(inode_raw).partition('-')
    return int(inode_string)
  except ValueError:
    return -1