Refactoring CLI, processing and searching

Jason Solomon 2020-11-20 14:25:36 +11:00
parent 58d306b6e0
commit ccc9edfc6f
2 changed files with 198 additions and 3 deletions

dfdewey/utils/image_processor.py

@@ -24,6 +24,7 @@ from dfvfs.helpers import volume_scanner
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.lib import errors as dfvfs_errors
from dfvfs.resolver import resolver
from dfvfs.volume import tsk_volume_system
import pytsk3

from dfdewey.datastore.elastic import ElasticsearchDataStore
@@ -71,6 +72,7 @@ class FileEntryScanner(volume_scanner.VolumeScanner):
    self._datastore = None
    self._list_only_files = False
    self._rows = []
    self._volumes = {}

  def _get_display_path(self, path_spec, path_segments, data_stream_name):
    """Retrieves a path to display.
@@ -116,6 +118,24 @@ class FileEntryScanner(volume_scanner.VolumeScanner):
    inode = getattr(path_spec, 'inode', None)
    return inode

  def _get_tsk_partition_path_spec(self, path_spec):
    """Gets the path spec for the TSK partition.

    Args:
      path_spec (dfvfs.PathSpec): path spec of the volume.

    Returns:
      TSK partition path_spec or None.
    """
    partition_path_spec = None
    while path_spec.HasParent():
      type_indicator = path_spec.type_indicator
      if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION:
        partition_path_spec = path_spec
        break
      path_spec = path_spec.parent
    return partition_path_spec
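
For context, a minimal sketch of the parent chain this helper walks, assuming a raw image containing one TSK partition (the image path and partition location below are illustrative):

from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.path import factory as path_spec_factory

# Illustrative chain: OS file -> raw image -> TSK partition -> TSK file system.
os_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_OS, location='/tmp/image.raw')
raw_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_RAW, parent=os_spec)
partition_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION, location='/p1',
    parent=raw_spec)
fs_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_TSK, location='/', parent=partition_spec)

# Walking fs_spec.parent upwards finds the TSK_PARTITION layer, so
# _get_tsk_partition_path_spec(fs_spec) returns partition_spec here.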

  def _get_volume_location(self, path_spec):
    """Gets volume location / identifier for the given path spec.
@@ -174,6 +194,46 @@ class FileEntryScanner(volume_scanner.VolumeScanner):
      self._list_file_entry(
          file_system, sub_file_entry, path_segments, location)

  def get_volume_extents(self, image_path):
    """Gets the extents of all volumes.

    Args:
      image_path (str): path of the source image.

    Returns:
      Volume location / identifier, offset, and size for all volumes.
    """
    if not self._volumes or self._source_path != image_path:
      base_path_specs = self.GetBasePathSpecs(image_path)
      for path_spec in base_path_specs:
        partition_path_spec = self._get_tsk_partition_path_spec(path_spec)
        if not partition_path_spec:
          location = getattr(path_spec, 'location', None)
          self._volumes[location] = {'start': 0, 'end': None}
        else:
          location = getattr(partition_path_spec, 'location', None)
          partition_offset = None
          partition_size = None
          volume_system = tsk_volume_system.TSKVolumeSystem()
          try:
            volume_system.Open(partition_path_spec)
            volume_identifier = location.replace('/', '')
            volume = volume_system.GetVolumeByIdentifier(volume_identifier)
            partition_offset = volume.extents[0].offset
            partition_size = volume.extents[0].size
          except dfvfs_errors.VolumeSystemError as e:
            log.error('Could not process partition: %s', e)
            # Skip this volume rather than computing extents from None values.
            continue
          self._volumes[location] = {
              'start': partition_offset,
              'end': partition_offset + partition_size
          }
    return self._volumes
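
A usage sketch for the new helper (the image path and extent values are illustrative); the returned mapping is keyed by volume location:

scanner = FileEntryScanner(mediator=UnattendedVolumeScannerMediator())
extents = scanner.get_volume_extents('/tmp/image.raw')
# Example shape: {'/p1': {'start': 1048576, 'end': 20971520}}
# An image of a single volume yields {'/': {'start': 0, 'end': None}}.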

  def parse_file_entries(self, base_path_specs, datastore):
    """Parses file entries in the base path specification.

dfdewey/utils/index_searcher.py

@@ -17,10 +17,13 @@
import logging
import os

from dfvfs.lib import errors as dfvfs_errors
import pytsk3
from tabulate import tabulate

from dfdewey.datastore.elastic import ElasticsearchDataStore
from dfdewey.datastore.postgresql import PostgresqlDataStore
from dfdewey.utils.image_processor import FileEntryScanner, UnattendedVolumeScannerMediator

log = logging.getLogger('dfdewey.index_searcher')
@@ -47,7 +50,7 @@ class _SearchHit():
    """
    search_hit_dict = {}
    search_hit_dict['Offset'] = self.offset
    search_hit_dict['Filename'] = self.filename
    search_hit_dict['Filename (inode)'] = self.filename
    search_hit_dict['String'] = self.data

    return search_hit_dict
@@ -64,6 +67,7 @@ class IndexSearcher():
    self.image = image
    self.images = {}
    self.postgresql = PostgresqlDataStore()
    self.scanner = None

    if image != 'all':
      self.image = os.path.abspath(self.image)
@@ -83,6 +87,93 @@ class IndexSearcher():
    for image_hash, image_path in images:
      self.images[image_hash] = image_path

  def _get_filenames_from_inode(self, inode, location):
    """Gets filename(s) from an inode number.

    Args:
      inode (int): inode number of the target file.
      location (str): partition location / identifier.

    Returns:
      List of filenames for the given inode (empty if the inode is not
          allocated to a file).
    """
    results = self.postgresql.query((
        'SELECT filename FROM files '
        'WHERE inum = {0:d} AND part = \'{1:s}\'').format(inode, location))
    filenames = []
    for result in results:
      filenames.append(result[0])
    return filenames
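
One inode can map to several names (NTFS hard links, for example), which is why a list is returned. A hypothetical call, with invented values:

filenames = searcher._get_filenames_from_inode(42, '/p1')
# e.g. ['report.txt', 'report_link.txt'] if the 'files' table holds
# two rows for inode 42 in partition '/p1'.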

  def _get_filename_from_offset(self, image_path, image_hash, offset):
    """Gets filename given a byte offset within an image.

    Args:
      image_path (str): source image path.
      image_hash (str): source image hash.
      offset (int): byte offset within the image.

    Returns:
      List of filenames allocated to the given offset (empty if none).
    """
    filenames = []

    database_name = ''.join(('fs', image_hash))
    self.postgresql.switch_database(db_name=database_name)

    volume_extents = {}
    try:
      if not self.scanner:
        mediator = UnattendedVolumeScannerMediator()
        self.scanner = FileEntryScanner(mediator=mediator)
      volume_extents = self.scanner.get_volume_extents(image_path)
    except dfvfs_errors.ScannerError as e:
      log.error('Error scanning for partitions: %s', e)

    hit_location = None
    partition_offset = None
    for location, extent in volume_extents.items():
      if not extent['end']:
        # Image is of a single volume
        hit_location = location
        partition_offset = extent['start']
      elif extent['start'] <= offset < extent['end']:
        hit_location = location
        partition_offset = extent['start']

    if partition_offset is not None:
      try:
        img = pytsk3.Img_Info(image_path)
        filesystem = pytsk3.FS_Info(img, offset=partition_offset)
        block_size = filesystem.info.block_size
      except TypeError as e:
        log.error('Error opening image: %s', e)
        # The file system could not be opened, so no filenames can be resolved.
        return filenames
      inodes = self._get_inodes(
          int((offset - partition_offset) / block_size), hit_location)
      if inodes:
        for i in inodes:
          inode = i[0]
          # Account for resident files
          if (i[0] == 0 and
              filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT):
            mft_record_size_offset = 0x40 + partition_offset
            mft_record_size = int.from_bytes(
                img.read(mft_record_size_offset, 1), 'little', signed=True)
            if mft_record_size < 0:
              # Negative values encode the record size as a power of two.
              mft_record_size = 2**(mft_record_size * -1)
            else:
              mft_record_size = mft_record_size * block_size
            inode = self._get_ntfs_resident_inode((offset - partition_offset),
                                                  filesystem, mft_record_size)

          inode_filenames = self._get_filenames_from_inode(inode, hit_location)
          filename = ' | '.join(inode_filenames)
          filenames.append('{0:s} ({1:d})'.format(filename, inode))

    return filenames
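
A worked example of the offset-to-block mapping (numbers hypothetical): a hit at image offset 1052672 inside a partition starting at byte 1048576 with 4096-byte blocks lands in block 1 of that volume.

block = int((1052672 - 1048576) / 4096)  # = 1
# _get_inodes(1, hit_location) returns the inode(s) allocated that block,
# and each inode is then resolved to filename(s) via the 'files' table.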

  def _get_image_hash(self):
    """Get an image hash from the datastore.
@@ -92,7 +183,49 @@ class IndexSearcher():
    image_hash = self.postgresql.query_single_row(
        'SELECT image_hash FROM images WHERE image_path = \'{0:s}\''.format(
            self.image))
    self.images[image_hash[0]] = self.image
    if image_hash:
      self.images[image_hash[0]] = self.image

  def _get_inodes(self, block, location):
    """Gets inode numbers for a block offset.

    Args:
      block (int): block offset within the image.
      location (str): partition location / identifier.

    Returns:
      Inode number(s) allocated to the given block (empty if none).
    """
    inodes = self.postgresql.query(
        ('SELECT inum FROM blocks '
         'WHERE block = {0:d} AND part = \'{1:s}\'').format(block, location))
    return inodes

  def _get_ntfs_resident_inode(self, offset, filesystem, mft_record_size):
    """Gets the inode number associated with NTFS $MFT resident data.

    Args:
      offset (int): data offset within the volume.
      filesystem (pytsk3.FS_Info): pytsk3 filesystem object.
      mft_record_size (int): size of each $MFT entry.

    Returns:
      MFT entry number of the resident data, or 0 if no match is found.
    """
    block_size = filesystem.info.block_size
    offset_block = int(offset / block_size)

    inode = filesystem.open_meta(0)
    mft_entry = 0
    for attr in inode:
      for run in attr:
        for block in range(run.len):
          if run.addr + block == offset_block:
            # Found the $MFT block containing the hit: add the record index
            # within that block.
            mft_entry += int(
                (offset - (offset_block * block_size)) / mft_record_size)
            return mft_entry
          mft_entry += int(block_size / mft_record_size)
    return 0
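
A worked example of the $MFT arithmetic (values hypothetical): the signed byte at offset 0x40 of an NTFS boot sector is commonly 0xF6, and with 4096-byte blocks each $MFT block then holds four 1024-byte records.

mft_record_size = int.from_bytes(b'\xf6', 'little', signed=True)  # -10
mft_record_size = 2**(mft_record_size * -1)  # 2**10 = 1024 bytes per record
# If a hit lies 2600 bytes into the $MFT block holding entries 8-11, the
# resident data belongs to entry 8 + int(2600 / 1024) = entry 10.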

  def search(self, query):
    """Run a single query.
@@ -115,7 +248,9 @@ class IndexSearcher():
      if result['_source']['file_offset']:
        offset = '-'.join((offset, result['_source']['file_offset']))
      hit.offset = offset
      # TODO (dfjxs): Filenames
      filenames = self._get_filename_from_offset(
          image_path, image_hash, result['_source']['offset'])
      hit.filename = '\n'.join(filenames)
      hit.data = result['_source']['data'].strip()
      hits.append(hit.copy_to_dict())
    output = tabulate(hits, headers='keys', tablefmt='simple')
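
For reference, a sketch of the tabulated output after this change (hit values invented for illustration):

Offset    Filename (inode)    String
--------  ------------------  ------------------------
1052672   report.txt (42)     ...matching string...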