Refactoring CLI, processing and searching

Jason Solomon 2020-11-20 14:25:36 +11:00
parent 58d306b6e0
commit ccc9edfc6f
2 changed files with 198 additions and 3 deletions

dfdewey/utils/image_processor.py

@@ -24,6 +24,7 @@ from dfvfs.helpers import volume_scanner
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.lib import errors as dfvfs_errors
from dfvfs.resolver import resolver
from dfvfs.volume import tsk_volume_system
import pytsk3

from dfdewey.datastore.elastic import ElasticsearchDataStore
@@ -71,6 +72,7 @@ class FileEntryScanner(volume_scanner.VolumeScanner):
    self._datastore = None
    self._list_only_files = False
    self._rows = []
    self._volumes = {}

  def _get_display_path(self, path_spec, path_segments, data_stream_name):
    """Retrieves a path to display.
@@ -116,6 +118,24 @@ class FileEntryScanner(volume_scanner.VolumeScanner):
    inode = getattr(path_spec, 'inode', None)
    return inode

  def _get_tsk_partition_path_spec(self, path_spec):
    """Gets the path spec for the TSK partition.

    Args:
      path_spec (dfvfs.PathSpec): path spec of the volume.

    Returns:
      TSK partition path_spec or None.
    """
    partition_path_spec = None
    while path_spec.HasParent():
      type_indicator = path_spec.type_indicator
      if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION:
        partition_path_spec = path_spec
        break
      path_spec = path_spec.parent
    return partition_path_spec
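
For context, a minimal sketch of the parent chain this helper walks, assuming a raw image containing one TSK partition (the image path and partition location below are illustrative):

from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.path import factory as path_spec_factory

# Illustrative chain: OS file -> raw image -> TSK partition -> TSK file system.
os_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_OS, location='/tmp/image.raw')
raw_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_RAW, parent=os_spec)
partition_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION, location='/p1',
    parent=raw_spec)
fs_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_TSK, location='/', parent=partition_spec)

# Walking fs_spec.parent upwards finds the TSK_PARTITION layer, so
# _get_tsk_partition_path_spec(fs_spec) returns partition_spec here.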

  def _get_volume_location(self, path_spec):
    """Gets volume location / identifier for the given path spec.
@@ -174,6 +194,46 @@ class FileEntryScanner(volume_scanner.VolumeScanner):
      self._list_file_entry(
          file_system, sub_file_entry, path_segments, location)

  def get_volume_extents(self, image_path):
    """Gets the extents of all volumes.

    Args:
      image_path (str): path of the source image.

    Returns:
      Volume location / identifier, offset, and size for all volumes.
    """
    if not self._volumes or self._source_path != image_path:
      base_path_specs = self.GetBasePathSpecs(image_path)
      for path_spec in base_path_specs:
        partition_path_spec = self._get_tsk_partition_path_spec(path_spec)
        if not partition_path_spec:
          location = getattr(path_spec, 'location', None)
          self._volumes[location] = {'start': 0, 'end': None}
        else:
          location = getattr(partition_path_spec, 'location', None)
          partition_offset = None
          partition_size = None
          volume_system = tsk_volume_system.TSKVolumeSystem()
          try:
            volume_system.Open(partition_path_spec)
            volume_identifier = location.replace('/', '')
            volume = volume_system.GetVolumeByIdentifier(volume_identifier)
            partition_offset = volume.extents[0].offset
            partition_size = volume.extents[0].size
          except dfvfs_errors.VolumeSystemError as e:
            log.error('Could not process partition: %s', e)
            # Skip this volume rather than computing extents from None values.
            continue
          self._volumes[location] = {
              'start': partition_offset,
              'end': partition_offset + partition_size
          }
    return self._volumes
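
A usage sketch for the new helper (the image path and extent values are illustrative); the returned mapping is keyed by volume location:

scanner = FileEntryScanner(mediator=UnattendedVolumeScannerMediator())
extents = scanner.get_volume_extents('/tmp/image.raw')
# Example shape: {'/p1': {'start': 1048576, 'end': 20971520}}
# An image of a single volume yields {'/': {'start': 0, 'end': None}}.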

  def parse_file_entries(self, base_path_specs, datastore):
    """Parses file entries in the base path specification.

dfdewey/utils/index_searcher.py

@@ -17,10 +17,13 @@
import logging
import os

from dfvfs.lib import errors as dfvfs_errors
import pytsk3
from tabulate import tabulate

from dfdewey.datastore.elastic import ElasticsearchDataStore
from dfdewey.datastore.postgresql import PostgresqlDataStore
from dfdewey.utils.image_processor import FileEntryScanner, UnattendedVolumeScannerMediator

log = logging.getLogger('dfdewey.index_searcher')
@@ -47,7 +50,7 @@ class _SearchHit():
    """
    search_hit_dict = {}
    search_hit_dict['Offset'] = self.offset
    search_hit_dict['Filename'] = self.filename
    search_hit_dict['Filename (inode)'] = self.filename
    search_hit_dict['String'] = self.data

    return search_hit_dict
@@ -64,6 +67,7 @@ class IndexSearcher():
    self.image = image
    self.images = {}
    self.postgresql = PostgresqlDataStore()
    self.scanner = None

    if image != 'all':
      self.image = os.path.abspath(self.image)
@@ -83,6 +87,93 @@ class IndexSearcher():
    for image_hash, image_path in images:
      self.images[image_hash] = image_path

  def _get_filenames_from_inode(self, inode, location):
    """Gets filename(s) from an inode number.

    Args:
      inode (int): inode number of the target file.
      location (str): partition location / identifier.

    Returns:
      List of filenames for the given inode (empty if the inode is not
          allocated to a file).
    """
    results = self.postgresql.query((
        'SELECT filename FROM files '
        'WHERE inum = {0:d} AND part = \'{1:s}\'').format(inode, location))
    filenames = []
    for result in results:
      filenames.append(result[0])
    return filenames
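
One inode can map to several names (NTFS hard links, for example), which is why a list is returned. A hypothetical call, with invented values:

filenames = searcher._get_filenames_from_inode(42, '/p1')
# e.g. ['report.txt', 'report_link.txt'] if the 'files' table holds
# two rows for inode 42 in partition '/p1'.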

  def _get_filename_from_offset(self, image_path, image_hash, offset):
    """Gets filename given a byte offset within an image.

    Args:
      image_path (str): source image path.
      image_hash (str): source image hash.
      offset (int): byte offset within the image.

    Returns:
      List of filenames allocated to the given offset (empty if none).
    """
    filenames = []

    database_name = ''.join(('fs', image_hash))
    self.postgresql.switch_database(db_name=database_name)

    volume_extents = {}
    try:
      if not self.scanner:
        mediator = UnattendedVolumeScannerMediator()
        self.scanner = FileEntryScanner(mediator=mediator)
      volume_extents = self.scanner.get_volume_extents(image_path)
    except dfvfs_errors.ScannerError as e:
      log.error('Error scanning for partitions: %s', e)

    hit_location = None
    partition_offset = None
    for location, extent in volume_extents.items():
      if not extent['end']:
        # Image is of a single volume
        hit_location = location
        partition_offset = extent['start']
      elif extent['start'] <= offset < extent['end']:
        hit_location = location
        partition_offset = extent['start']

    if partition_offset is not None:
      try:
        img = pytsk3.Img_Info(image_path)
        filesystem = pytsk3.FS_Info(img, offset=partition_offset)
        block_size = filesystem.info.block_size
      except TypeError as e:
        log.error('Error opening image: %s', e)
        # The file system could not be opened, so no filenames can be resolved.
        return filenames
      inodes = self._get_inodes(
          int((offset - partition_offset) / block_size), hit_location)
      if inodes:
        for i in inodes:
          inode = i[0]
          # Account for resident files
          if (i[0] == 0 and
              filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT):
            mft_record_size_offset = 0x40 + partition_offset
            mft_record_size = int.from_bytes(
                img.read(mft_record_size_offset, 1), 'little', signed=True)
            if mft_record_size < 0:
              # Negative values encode the record size as a power of two.
              mft_record_size = 2**(mft_record_size * -1)
            else:
              mft_record_size = mft_record_size * block_size
            inode = self._get_ntfs_resident_inode((offset - partition_offset),
                                                  filesystem, mft_record_size)

          inode_filenames = self._get_filenames_from_inode(inode, hit_location)
          filename = ' | '.join(inode_filenames)
          filenames.append('{0:s} ({1:d})'.format(filename, inode))

    return filenames
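
A worked example of the offset-to-block mapping (numbers hypothetical): a hit at image offset 1052672 inside a partition starting at byte 1048576 with 4096-byte blocks lands in block 1 of that volume.

block = int((1052672 - 1048576) / 4096)  # = 1
# _get_inodes(1, hit_location) returns the inode(s) allocated that block,
# and each inode is then resolved to filename(s) via the 'files' table.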

  def _get_image_hash(self):
    """Get an image hash from the datastore.
@@ -92,7 +183,49 @@ class IndexSearcher():
    image_hash = self.postgresql.query_single_row(
        'SELECT image_hash FROM images WHERE image_path = \'{0:s}\''.format(
            self.image))
    self.images[image_hash[0]] = self.image
    if image_hash:
      self.images[image_hash[0]] = self.image

  def _get_inodes(self, block, location):
    """Gets inode numbers for a block offset.

    Args:
      block (int): block offset within the image.
      location (str): partition location / identifier.

    Returns:
      Inode number(s) allocated to the given block (empty if none).
    """
    inodes = self.postgresql.query(
        ('SELECT inum FROM blocks '
         'WHERE block = {0:d} AND part = \'{1:s}\'').format(block, location))
    return inodes

  def _get_ntfs_resident_inode(self, offset, filesystem, mft_record_size):
    """Gets the inode number associated with NTFS $MFT resident data.

    Args:
      offset (int): data offset within the volume.
      filesystem (pytsk3.FS_Info): pytsk3 filesystem object.
      mft_record_size (int): size of each $MFT entry.

    Returns:
      MFT entry number of the resident data, or 0 if no match is found.
    """
    block_size = filesystem.info.block_size
    offset_block = int(offset / block_size)

    inode = filesystem.open_meta(0)
    mft_entry = 0
    for attr in inode:
      for run in attr:
        for block in range(run.len):
          if run.addr + block == offset_block:
            # Found the $MFT block containing the hit: add the record index
            # within that block.
            mft_entry += int(
                (offset - (offset_block * block_size)) / mft_record_size)
            return mft_entry
          mft_entry += int(block_size / mft_record_size)
    return 0
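
A worked example of the $MFT arithmetic (values hypothetical): the signed byte at offset 0x40 of an NTFS boot sector is commonly 0xF6, and with 4096-byte blocks each $MFT block then holds four 1024-byte records.

mft_record_size = int.from_bytes(b'\xf6', 'little', signed=True)  # -10
mft_record_size = 2**(mft_record_size * -1)  # 2**10 = 1024 bytes per record
# If a hit lies 2600 bytes into the $MFT block holding entries 8-11, the
# resident data belongs to entry 8 + int(2600 / 1024) = entry 10.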

  def search(self, query):
    """Run a single query.
@@ -115,7 +248,9 @@ class IndexSearcher():
      if result['_source']['file_offset']:
        offset = '-'.join((offset, result['_source']['file_offset']))
      hit.offset = offset
      # TODO (dfjxs): Filenames
      filenames = self._get_filename_from_offset(
          image_path, image_hash, result['_source']['offset'])
      hit.filename = '\n'.join(filenames)
      hit.data = result['_source']['data'].strip()
      hits.append(hit.copy_to_dict())
    output = tabulate(hits, headers='keys', tablefmt='simple')
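
For reference, a sketch of the tabulated output after this change (hit values invented for illustration):

Offset    Filename (inode)    String
--------  ------------------  ------------------------
1052672   report.txt (42)     ...matching string...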