From fb51db3b9a74b1246b288f2a9a0a1c5e9aa3c44e Mon Sep 17 00:00:00 2001 From: Jason Solomon Date: Thu, 19 Nov 2020 18:41:31 +1100 Subject: [PATCH 1/6] Refactoring CLI, processing and searching --- dfdewey/datastore/elastic.py | 16 +- dfdewey/datastore/postgresql.py | 23 +- dfdewey/datastore/postgresql_test.py | 2 +- dfdewey/dfdcli.py | 260 +++--------- dfdewey/utils/image_processor.py | 577 +++++++++++++++++++++++++++ dfdewey/utils/index_searcher.py | 66 +++ dfvfs_requirements.txt | 26 ++ requirements.txt | 1 + setup.py | 2 + 9 files changed, 741 insertions(+), 232 deletions(-) create mode 100644 dfdewey/utils/image_processor.py create mode 100644 dfdewey/utils/index_searcher.py create mode 100644 dfvfs_requirements.txt diff --git a/dfdewey/datastore/elastic.py b/dfdewey/datastore/elastic.py index e038494..e913da1 100644 --- a/dfdewey/datastore/elastic.py +++ b/dfdewey/datastore/elastic.py @@ -16,16 +16,11 @@ import codecs import collections -import logging from elasticsearch import Elasticsearch from elasticsearch import exceptions import six -# Setup logging -es_logger = logging.getLogger('dfdewey.elasticsearch') -es_logger.setLevel(logging.WARNING) - class ElasticsearchDataStore(): """Implements the datastore.""" @@ -138,6 +133,17 @@ class ElasticsearchDataStore(): return self.import_counter['events'] + def index_exists(self, index_name): + """Check if an index already exists. + + Args: + index_name: Name of the index + + Returns: + True if the index exists, False if not. + """ + return self.client.indices.exists(index_name) + def search(self, index_id, query_string, size=DEFAULT_SIZE): """Search ElasticSearch. diff --git a/dfdewey/datastore/postgresql.py b/dfdewey/datastore/postgresql.py index d104092..f6dcdc6 100644 --- a/dfdewey/datastore/postgresql.py +++ b/dfdewey/datastore/postgresql.py @@ -14,15 +14,9 @@ # limitations under the License. """PostgreSQL datastore.""" -import logging - import psycopg2 from psycopg2 import extras -# Setup logging -postgresql_logger = logging.getLogger('dfdewey.postgresql') -postgresql_logger.setLevel(logging.WARNING) - class PostgresqlDataStore(): """Implements the datastore.""" @@ -31,9 +25,12 @@ class PostgresqlDataStore(): self, host='127.0.0.1', port=5432, db_name='dfdewey', autocommit=False): """Create a PostgreSQL client.""" super().__init__() - self.db = psycopg2.connect( - database=db_name, user='dfdewey', password='password', host=host, - port=port) + try: + self.db = psycopg2.connect( + database=db_name, user='dfdewey', password='password', host=host, + port=port) + except psycopg2.OperationalError as e: + raise RuntimeError('Unable to connect to PostgreSQL.') from e if autocommit: self.db.set_isolation_level( psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) @@ -41,9 +38,11 @@ class PostgresqlDataStore(): def __del__(self): """Finalise a PostgreSQL client.""" - if self.db: + try: self.db.commit() self.db.close() + except AttributeError: + pass def bulk_insert(self, table_spec, rows): """Execute a bulk insert into a table. @@ -53,7 +52,9 @@ class PostgresqlDataStore(): rows: Array of value tuples to be inserted """ extras.execute_values( - self.cursor, 'INSERT INTO {0:s} VALUES %s'.format(table_spec), rows) + self.cursor, + 'INSERT INTO {0:s} VALUES %s ON CONFLICT DO NOTHING'.format(table_spec), + rows) def execute(self, command): """Execute a command in the PostgreSQL database. 
diff --git a/dfdewey/datastore/postgresql_test.py b/dfdewey/datastore/postgresql_test.py index 61be2a2..8e90c4e 100644 --- a/dfdewey/datastore/postgresql_test.py +++ b/dfdewey/datastore/postgresql_test.py @@ -40,7 +40,7 @@ class PostgresqlTest(unittest.TestCase): rows = [(1, 1), (2, 2), (3, 3)] db.bulk_insert('blocks (block, inum)', rows) - expected_sql = 'INSERT INTO blocks (block, inum) VALUES %s' + expected_sql = 'INSERT INTO blocks (block, inum) VALUES %s ON CONFLICT DO NOTHING' mock_execute_values.assert_called_once_with(db.cursor, expected_sql, rows) def test_execute(self): diff --git a/dfdewey/dfdcli.py b/dfdewey/dfdcli.py index 9622cd6..19ae4b5 100755 --- a/dfdewey/dfdcli.py +++ b/dfdewey/dfdcli.py @@ -16,19 +16,20 @@ """DFDewey Command-Line Interface.""" import argparse -import datetime +import logging import os -import subprocess -import tempfile +import sys -from dfdewey.datastore.elastic import ElasticsearchDataStore -from dfdewey.datastore.postgresql import PostgresqlDataStore -from dfdewey.utils import image +from dfdewey.utils.image_processor import ImageProcessorOptions, ImageProcessor +from dfdewey.utils.index_searcher import IndexSearcher STRING_INDEXING_LOG_INTERVAL = 10000000 +# Setup logging +log = logging.getLogger('dfdewey') -class _StringRecord(object): + +class _StringRecord(): """Elasticsearch string record. Attributes: @@ -46,6 +47,30 @@ class _StringRecord(object): self.data = '' +def main(): + """Main DFDewey function.""" + args = parse_args() + + setup_logging() + + if not args.search and not args.search_list: + # Processing an image since no search terms specified + if args.image == 'all': + log.error('Image must be supplied for processing.') + sys.exit(1) + image_processor_options = ImageProcessorOptions( + not args.no_base64, not args.no_gzip, not args.no_zip) + image_processor = ImageProcessor( + args.case, os.path.abspath(args.image), image_processor_options) + image_processor.process_image() + else: + index_searcher = IndexSearcher(args.case, args.image) + if args.search: + pass + elif args.search_list: + pass + + def parse_args(): """Argument parsing function. @@ -54,8 +79,9 @@ def parse_args(): """ parser = argparse.ArgumentParser() - parser.add_argument('-c', '--case', required=True, help='case ID') - parser.add_argument('-i', '--image', help='image file') + parser.add_argument('case', help='case ID') + parser.add_argument( + 'image', nargs='?', default='all', help='image file (default: \'all\')') # Indexing args parser.add_argument( @@ -73,213 +99,17 @@ def parse_args(): return args -def process_image(image_file, case, base64, gunzip, unzip): - """Image processing function. +def setup_logging(): + """Configure the logger.""" + log.propagate = False + log.setLevel(logging.INFO) - Run string extraction, indexing, and filesystem parsing for image file. 
- - Args: - image_file: The image file to be processed - case: Case ID - base64: Flag to decode Base64 - gunzip: Flag to decompress gzip - unzip: Flag to decompress zip - """ - image_path = os.path.abspath(image_file) - output_path = tempfile.mkdtemp() - - cmd = ['bulk_extractor', '-o', output_path, '-x', 'all', '-e', 'wordlist'] - - if base64: - cmd.extend(['-e', 'base64']) - if gunzip: - cmd.extend(['-e', 'gzip']) - if unzip: - cmd.extend(['-e', 'zip']) - - cmd.extend(['-S', 'strings=YES', '-S', 'word_max=1000000']) - cmd.extend([image_path]) - - print('Processing start: {0!s}'.format(datetime.datetime.now())) - - print('\n*** Running bulk extractor:\n{0:s}'.format(' '.join(cmd))) - output = subprocess.check_output(cmd) - md5_offset = output.index(b'MD5') + 19 - image_hash = output[md5_offset:md5_offset + 32].decode('utf-8') - print('String extraction completed: {0!s}'.format(datetime.datetime.now())) - - print('\n*** Parsing image') - image_already_processed = image.initialise_block_db( - image_path, image_hash, case) - print('Parsing completed: {0!s}'.format(datetime.datetime.now())) - - if not image_already_processed: - print('\n*** Indexing image') - index_strings(output_path, image_hash) - print('Indexing completed: {0!s}'.format(datetime.datetime.now())) - else: - print('\n*** Image already indexed') - - print('Processing complete!') - - -def index_strings(output_path, image_hash): - """ElasticSearch indexing function. - - Args: - output_path: The output directory from bulk_extractor - image_hash: MD5 of the parsed image - """ - print('\n*** Indexing data...') - es = ElasticsearchDataStore() - index_name = ''.join(('es', image_hash)) - index_name = es.create_index(index_name=index_name) - print('Index {0:s} created.'.format(index_name)) - - string_list = os.path.join(output_path, 'wordlist.txt') - with open(string_list, 'r') as strings: - for line in strings: - if not line.startswith('#'): - string_record = _StringRecord() - string_record.image = image_hash - - line = line.split('\t') - offset = line[0] - data = '\t'.join(line[1:]) - if offset.find('-') > 0: - offset = offset.split('-') - image_offset = offset[0] - file_offset = '-'.join(offset[1:]) - string_record.offset = int(image_offset) - string_record.file_offset = file_offset - else: - string_record.offset = int(offset) - - string_record.data = data - records = index_record(es, index_name, string_record) - if records % STRING_INDEXING_LOG_INTERVAL == 0: - print('Indexed {0:d} records...'.format(records)) - - records = es.import_event(index_name) - print('\n*** Indexed {0:d} strings.'.format(records)) - - -def index_record(es, index_name, string_record): - """Index a single record. - - Args: - es: Elasticsearch datastore - index_name: ID of the elasticsearch index - string_record: String record to be indexed - - Returns: - Number of records processed - """ - json_record = { - 'image': string_record.image, - 'offset': string_record.offset, - 'file_offset': string_record.file_offset, - 'data': string_record.data - } - return es.import_event(index_name, event=json_record) - - -def search(query, case, image_path=None, query_list=None): - """Search function. - - Searches either the index for a single image, or indexes for all images - in a given case if no image_path is specified. 
- - Args: - query: The query to run against the index - case: The case to query (if no specific image is provided) - image_path: Optional path of the source image - query_list: Path to a text file containing multiple search terms - """ - case_db = PostgresqlDataStore() - images = {} - if image_path: - image_path = os.path.abspath(image_path) - - image_hash = case_db.query_single_row( - 'SELECT image_hash FROM images WHERE image_path = \'{0:s}\''.format( - image_path)) - - images[image_hash[0]] = image_path - else: - print( - 'No image specified, searching all images in case \'{0:s}\''.format( - case)) - image_hashes = case_db.query( - 'SELECT image_hash FROM image_case WHERE case_id = \'{0:s}\''.format( - case)) - for image_hash in image_hashes: - image_hash = image_hash[0] - image_path = case_db.query_single_row( - 'SELECT image_path FROM images WHERE image_hash = \'{0:s}\''.format( - image_hash)) - - images[image_hash] = image_path[0] - - for image_hash, image_path in images.items(): - print('\n\nSearching {0:s} ({1:s})'.format(images[image_hash], image_hash)) - index = ''.join(('es', image_hash)) - if query_list: - with open(query_list, 'r') as search_terms: - print('\n*** Searching for terms in \'{0:s}\'...'.format(query_list)) - for term in search_terms: - term = ''.join(('"', term.strip(), '"')) - results = search_index(index, term) - if results['hits']['total']['value'] > 0: - print( - '{0:s} - {1:d} hits'.format( - term, results['hits']['total']['value'])) - else: - print('\n*** Searching for \'{0:s}\'...'.format(query)) - results = search_index(index, query) - print('Returned {0:d} results:'.format(results['hits']['total']['value'])) - for hit in results['hits']['hits']: - filename = image.get_filename_from_offset( - image_path, hit['_source']['image'], int(hit['_source']['offset'])) - if hit['_source']['file_offset']: - print( - 'Offset: {0:d}\tFile: {1:s}\tFile offset:{2:s}\t' - 'String: {3:s}'.format( - hit['_source']['offset'], filename, - hit['_source']['file_offset'], - hit['_source']['data'].strip())) - else: - print( - 'Offset: {0:d}\tFile: {1:s}\tString: {2:s}'.format( - hit['_source']['offset'], filename, - hit['_source']['data'].strip())) - - -def search_index(index_id, search_query): - """ElasticSearch search function. - - Args: - index_id: The ID of the index to be searched - search_query: The query to run against the index - - Returns: - Search results returned - """ - es = ElasticsearchDataStore() - return es.search(index_id, search_query) - - -def main(): - """Main DFDewey function.""" - args = parse_args() - if not args.search and not args.search_list: - process_image( - args.image, args.case, not args.no_base64, not args.no_gzip, - not args.no_zip) - elif args.search: - search(args.search, args.case, args.image) - elif args.search_list: - search(None, args.case, args.image, args.search_list) + # Log to stdout + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(logging.INFO) + formatter = logging.Formatter('[%(levelname)s] %(message)s') + console_handler.setFormatter(formatter) + log.addHandler(console_handler) if __name__ == '__main__': diff --git a/dfdewey/utils/image_processor.py b/dfdewey/utils/image_processor.py new file mode 100644 index 0000000..48718eb --- /dev/null +++ b/dfdewey/utils/image_processor.py @@ -0,0 +1,577 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor.""" + +from datetime import datetime +import logging +import os +import subprocess +import tempfile + +from dfvfs.helpers import volume_scanner +from dfvfs.lib import definitions as dfvfs_definitions +from dfvfs.lib import errors as dfvfs_errors +from dfvfs.resolver import resolver +import pytsk3 + +from dfdewey.datastore.elastic import ElasticsearchDataStore +from dfdewey.datastore.postgresql import PostgresqlDataStore + +BATCH_SIZE = 1500 +STRING_INDEXING_LOG_INTERVAL = 10000000 + +log = logging.getLogger('dfdewey.image_processor') + + +class _StringRecord(): + """Elasticsearch string record. + + Attributes: + image: Hash to identify the source image of the string + offset: Byte offset of the string within the source image + file_offset: If the string is extracted from a compressed stream, the byte + offset within the stream + data: The string to be indexed + """ + + def __init__(self): + self.image = '' + self.offset = 0 + self.file_offset = None + self.data = '' + + +class FileEntryScanner(volume_scanner.VolumeScanner): + """File entry scanner.""" + + _NON_PRINTABLE_CHARACTERS = list(range(0, 0x20)) + list(range(0x7f, 0xa0)) + _ESCAPE_CHARACTERS = str.maketrans({ + value: '\\x{0:02x}'.format(value) for value in _NON_PRINTABLE_CHARACTERS + }) + + def __init__(self, mediator=None): + """Initializes a file entry scanner. + + Args: + mediator (VolumeScannerMediator): a volume scanner mediator. + """ + super().__init__(mediator=mediator) + self._datastore = None + self._list_only_files = False + self._rows = [] + + def _get_display_path(self, path_spec, path_segments, data_stream_name): + """Retrieves a path to display. + + Args: + path_spec (dfvfs.PathSpec): path specification of the file entry. + path_segments (list[str]): path segments of the full path of the file + entry. + data_stream_name (str): name of the data stream. + + Returns: + str: path to display. + """ + display_path = '' + + if path_spec.HasParent(): + parent_path_spec = path_spec.parent + if parent_path_spec and parent_path_spec.type_indicator == ( + dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION): + display_path = ''.join([display_path, parent_path_spec.location]) + + path_segments = [ + segment.translate(self._ESCAPE_CHARACTERS) for segment in path_segments + ] + display_path = ''.join([display_path, '/'.join(path_segments)]) + + if data_stream_name: + data_stream_name = data_stream_name.translate(self._ESCAPE_CHARACTERS) + display_path = ':'.join([display_path, data_stream_name]) + + return display_path or '/' + + def _get_inode(self, path_spec): + """Gets the inode from a file entry path spec. + + Args: + path_spec (dfvfs.PathSpec): file entry path spec. + """ + inode = None + if path_spec.type_indicator == dfvfs_definitions.TYPE_INDICATOR_NTFS: + inode = getattr(path_spec, 'mft_entry', None) + else: + inode = getattr(path_spec, 'inode', None) + return inode + + def _get_volume_location(self, path_spec): + """Gets volume location / identifier for the given path spec. + + Args: + path_spec (dfvfs.PathSpec): path spec of the volume. + + Returns: + Volume location / identifier. 
+ """ + location = getattr(path_spec, 'location', None) + while path_spec.HasParent(): + type_indicator = path_spec.type_indicator + if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION: + if location in ('\\', '/'): + location = getattr(path_spec, 'location', None) + break + path_spec = path_spec.parent + return location + + def _list_file_entry( + self, file_system, file_entry, parent_path_segments, location): + """Lists a file entry. + + Args: + file_system (dfvfs.FileSystem): file system that contains the file entry. + file_entry (dfvfs.FileEntry): file entry to list. + parent_path_segments (str): path segments of the full path of the parent + file entry. + location (str): volume location / identifier. + """ + path_segments = parent_path_segments + [file_entry.name] + + inode = self._get_inode(file_entry.path_spec) + filename = self._get_display_path(file_entry.path_spec, path_segments, '') + if not self._list_only_files or file_entry.IsFile(): + if inode is not None: + self._rows.append(( + inode, + filename, + location, + )) + for data_stream in file_entry.data_streams: + if not data_stream.IsDefault(): + filename = ':'.join((filename, data_stream.name)) + self._rows.append(( + inode, + filename, + location, + )) + if len(self._rows) >= BATCH_SIZE: + self._datastore.bulk_insert( + 'files (inum, filename, part)', self._rows) + self._rows = [] + + for sub_file_entry in file_entry.sub_file_entries: + self._list_file_entry( + file_system, sub_file_entry, path_segments, location) + + def parse_file_entries(self, base_path_specs, datastore): + """Parses file entries in the base path specification. + + Stores parsed entries in the PostgreSQL datastore. + + Args: + base_path_specs (list[dfvfs.PathSpec]): source path specification. + datastore (PostgresqlDataStore): PostgreSQL datastore. + """ + self._datastore = datastore + for base_path_spec in base_path_specs: + file_system = resolver.Resolver.OpenFileSystem(base_path_spec) + file_entry = resolver.Resolver.OpenFileEntry(base_path_spec) + if file_entry is None: + log.warning( + 'Unable to open base path specification: %s', + base_path_spec.comparable) + return + + location = self._get_volume_location(base_path_spec) + self._list_file_entry(file_system, file_entry, [], location) + if self._rows: + self._datastore.bulk_insert('files (inum, filename, part)', self._rows) + self._rows = [] + + +class ImageProcessor(): + """Image processor class. + + Attributes: + case (str): case ID. + elasticsearch (ElasticsearchDataStore): elasticsearch datastore. + image_hash (str): MD5 hash of the image. + image_path (str): path to source image. + options (ImageProcessorOptions): image processor options. + output_path (str): output directory for string extraction. + path_specs (dfvfs.PathSpec): volume path specs. + postgresql (PostgresqlDataStore): postgresql database. + scanner (FileEntryScanner): dfvfs volume / file entry scanner. + """ + + def __init__(self, case, image_path, options): + """Create an image processor.""" + super().__init__() + self.case = case + self.elasticsearch = None + self.image_hash = None + self.image_path = image_path + self.options = options + self.output_path = None + self.path_specs = [] + self.postgresql = None + self.scanner = None + + def _already_parsed(self): + """Check if image is already parsed. + + Checks whether the image is already in the database. + If so, checks whether it's attached to the case. + Adds the image to the database and attaches it to the case. 
+ + Returns: + True if image has already been parsed, False if not. + """ + tables_exist = self.postgresql.table_exists('images') + + image_exists = False + if not tables_exist: + self._initialise_database() + else: + image_exists = self.postgresql.value_exists( + 'images', 'image_hash', self.image_hash) + + # Even if the image has already been parsed, it may have been in a different + # case. + image_case_exists = False + if image_exists: + image_case = self.postgresql.query_single_row(( + 'SELECT 1 from image_case ' + 'WHERE image_hash = \'{0:s}\' AND case_id = \'{1:s}\'').format( + self.image_hash, self.case)) + if image_case: + image_case_exists = True + else: + self.postgresql.execute(( + 'INSERT INTO images (image_path, image_hash) ' + 'VALUES (\'{0:s}\', \'{1:s}\')').format( + self.image_path, self.image_hash)) + + if not image_case_exists: + self.postgresql.execute(( + 'INSERT INTO image_case (case_id, image_hash) ' + 'VALUES (\'{0:s}\', \'{1:s}\')').format(self.case, self.image_hash)) + + return image_exists + + def _create_filesystem_database(self): + """Create a filesystem database for the image.""" + self.postgresql.execute(( + 'CREATE TABLE blocks (block INTEGER, inum INTEGER, part TEXT, ' + 'PRIMARY KEY (block, inum, part))')) + self.postgresql.execute(( + 'CREATE TABLE files (inum INTEGER, filename TEXT, part TEXT, ' + 'PRIMARY KEY (inum, filename, part))')) + + def _extract_strings(self): + """String extraction. + + Extract strings from the image using bulk_extractor. + """ + cmd = [ + 'bulk_extractor', '-o', self.output_path, '-x', 'all', '-e', 'wordlist' + ] + + if self.options.base64: + cmd.extend(['-e', 'base64']) + if self.options.gunzip: + cmd.extend(['-e', 'gzip']) + if self.options.unzip: + cmd.extend(['-e', 'zip']) + + cmd.extend(['-S', 'strings=YES', '-S', 'word_max=1000000']) + cmd.append(self.image_path) + + log.info('Running bulk_extractor: [%s]', ' '.join(cmd)) + try: + output = subprocess.check_output(cmd) + except subprocess.CalledProcessError as e: + raise RuntimeError('String extraction failed.') from e + md5_offset = output.index(b'MD5') + 19 + self.image_hash = output[md5_offset:md5_offset + 32].decode('utf-8') + + def _get_volume_details(self, path_spec): + """Logs volume details for the given path spec. + + Args: + path_spec (dfvfs.PathSpec): path spec of the volume. + + Returns: + Volume location / identifier and byte offset. + """ + location = getattr(path_spec, 'location', None) + start_offset = 0 + while path_spec.HasParent(): + type_indicator = path_spec.type_indicator + if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION: + if location in ('\\', '/'): + location = getattr(path_spec, 'location', None) + start_offset = getattr(path_spec, 'start_offset', 0) + break + path_spec = path_spec.parent + return location, start_offset + + def _index_record(self, index_name, string_record): + """Index a single record. + + Args: + index_name: ID of the elasticsearch index. + string_record: String record to be indexed. 
+ + Returns: + Number of records processed + """ + json_record = { + 'image': string_record.image, + 'offset': string_record.offset, + 'file_offset': string_record.file_offset, + 'data': string_record.data + } + return self.elasticsearch.import_event(index_name, event=json_record) + + def _index_strings(self): + """Index the extracted strings.""" + self.elasticsearch = ElasticsearchDataStore() + index_name = ''.join(('es', self.image_hash)) + if self.elasticsearch.index_exists(index_name): + log.info('Image already indexed: [%s]', self.image_path) + else: + index_name = self.elasticsearch.create_index(index_name=index_name) + log.info('Index %s created.', index_name) + + string_list = os.path.join(self.output_path, 'wordlist.txt') + records = 0 + with open(string_list, 'r') as strings: + for line in strings: + # Ignore the comments added by bulk_extractor + if not line.startswith('#'): + string_record = _StringRecord() + string_record.image = self.image_hash + + # Split each string into offset and data + line = line.split('\t') + offset = line[0] + data = '\t'.join(line[1:]) + + # If the string is from a decoded / decompressed stream, split the + # offset into image offset and file offset + if offset.find('-') > 0: + offset = offset.split('-') + image_offset = offset[0] + file_offset = '-'.join(offset[1:]) + string_record.offset = int(image_offset) + string_record.file_offset = file_offset + else: + string_record.offset = int(offset) + + string_record.data = data + records = self._index_record(index_name, string_record) + + if records % STRING_INDEXING_LOG_INTERVAL == 0: + log.info('Indexed %d records...', records) + # Flush the import buffer + records = self.elasticsearch.import_event(index_name) + log.info('Indexed %d records...', records) + + def _initialise_database(self): + """Initialse the image database.""" + self.postgresql.execute( + 'CREATE TABLE images (image_path TEXT, image_hash TEXT PRIMARY KEY)') + + self.postgresql.execute(( + 'CREATE TABLE image_case (' + 'case_id TEXT, image_hash TEXT REFERENCES images(image_hash), ' + 'PRIMARY KEY (case_id, image_hash))')) + + def _parse_filesystems(self): + """Filesystem parsing. + + Parse each filesystem to create a mapping from byte offsets to files. 
+ """ + self.postgresql = PostgresqlDataStore(autocommit=True) + if self._already_parsed(): + log.info('Image already parsed: [%s]', self.image_path) + else: + db_name = ''.join(('fs', self.image_hash)) + self.postgresql.execute('CREATE DATABASE {0:s}'.format(db_name)) + self.postgresql.switch_database(db_name=db_name) + + self._create_filesystem_database() + + # Scan image for volumes + mediator = UnattendedVolumeScannerMediator() + try: + self.scanner = FileEntryScanner(mediator=mediator) + self.path_specs = self.scanner.GetBasePathSpecs(self.image_path) + log.info( + 'Found %d volume%s in [%s]:', len(self.path_specs), + '' if len(self.path_specs) == 1 else 's', self.image_path) + except dfvfs_errors.ScannerError as e: + log.error('Error scanning for partitions: %s', e) + + for path_spec in self.path_specs: + location, start_offset = self._get_volume_details(path_spec) + log.info( + '%s: %s (Offset %d)', location, path_spec.type_indicator, + start_offset) + if path_spec.type_indicator in (dfvfs_definitions.TYPE_INDICATOR_NTFS, + dfvfs_definitions.TYPE_INDICATOR_TSK): + self._parse_inodes(location, start_offset) + self.scanner.parse_file_entries([path_spec], self.postgresql) + else: + log.warning( + 'Volume type %s is not supported.', path_spec.type_indicator) + + def _parse_inodes(self, location, start_offset): + """Parse filesystem inodes. + + Create a mapping from blocks to inodes. + + Args: + location (str): location / identifier of the volume. + start_offset (int): byte offset of the volume. + """ + rows = [] + image = pytsk3.Img_Info(self.image_path) + filesystem = pytsk3.FS_Info(image, offset=start_offset) + for inode in range(filesystem.info.first_inum, + filesystem.info.last_inum + 1): + file_metadata = filesystem.open_meta(inode) + if file_metadata.info.meta.nlink > 0: + for attribute in file_metadata: + for run in attribute: + for block in range(run.len): + rows.append(( + run.addr + block, + inode, + location, + )) + if len(rows) >= BATCH_SIZE: + self.postgresql.bulk_insert('blocks (block, inum, part)', rows) + rows = [] + if rows: + self.postgresql.bulk_insert('blocks (block, inum, part)', rows) + + def process_image(self): + """Process the image.""" + self.output_path = tempfile.mkdtemp() + log.info('* Processing start: %s', datetime.now()) + self._extract_strings() + log.info('String extraction complete.') + + log.info('* Parsing image: %s', datetime.now()) + self._parse_filesystems() + log.info('Parsing complete.') + + log.info('* Indexing strings: %s', datetime.now()) + self._index_strings() + log.info('Indexing complete.') + + log.info('* Processing complete: %s', datetime.now()) + + +class ImageProcessorOptions(): + """Image processor options. + + Attributes: + base64 (bool): decode base64. + gunzip (bool): decompress gzip. + unzip (bool): decompress zip. + """ + + def __init__(self, base64=True, gunzip=True, unzip=True): + """Initialise image processor options.""" + super().__init__() + self.base64 = base64 + self.gunzip = gunzip + self.unzip = unzip + + +class UnattendedVolumeScannerMediator(volume_scanner.VolumeScannerMediator): + """Unattended volume scanner mediator.""" + + def GetAPFSVolumeIdentifiers(self, volume_system, volume_identifiers): + """Retrieves APFS volume identifiers. + + In an unattended execution, this method returns all volume identifiers. + + Args: + volume_system (APFSVolumeSystem): volume system. + volume_identifiers (list[str]): volume identifiers including prefix. + + Returns: + list[str]: all volume identifiers including prefix. 
+ """ + prefix = 'apfs' + return [ + '{0:s}{1:d}'.format(prefix, volume_index) + for volume_index in range(1, volume_system.number_of_volumes + 1) + ] + + def GetPartitionIdentifiers(self, volume_system, volume_identifiers): + """Retrieves partition identifiers. + + In an unattended execution, this method returns all partition identifiers. + + Args: + volume_system (TSKVolumeSystem): volume system. + volume_identifiers (list[str]): volume identifiers including prefix. + + Returns: + list[str]: all volume identifiers including prefix. + """ + prefix = 'p' + return [ + '{0:s}{1:d}'.format(prefix, volume_index) + for volume_index in range(1, volume_system.number_of_volumes + 1) + ] + + def GetVSSStoreIdentifiers(self, volume_system, volume_identifiers): + """Retrieves VSS store identifiers. + + Placeholder method for VSS support. + + Args: + volume_system (VShadowVolumeSystem): volume system. + volume_identifiers (list[str]): volume identifiers including prefix. + + Returns: + list[str]: None. + """ + return [] + + def UnlockEncryptedVolume( + self, source_scanner_object, scan_context, locked_scan_node, credentials): + """Unlocks an encrypted volume. + + Placeholder method for encrypted volume support. + + Args: + source_scanner_object (SourceScanner): source scanner. + scan_context (SourceScannerContext): source scanner context. + locked_scan_node (SourceScanNode): locked scan node. + credentials (Credentials): credentials supported by the locked scan node. + + Returns: + bool: True if the volume was unlocked. + """ + log.warning( + 'Encrypted volumes are currently unsupported: %s', + locked_scan_node.path_spec.CopyToDict()) + return False diff --git a/dfdewey/utils/index_searcher.py b/dfdewey/utils/index_searcher.py new file mode 100644 index 0000000..e5c06bf --- /dev/null +++ b/dfdewey/utils/index_searcher.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Index searcher.""" + +import logging +import os + +from dfdewey.datastore.elastic import ElasticsearchDataStore +from dfdewey.datastore.postgresql import PostgresqlDataStore + +log = logging.getLogger('dfdewey.index_searcher') + + +class IndexSearcher(): + """Index Searcher class.""" + + def __init__(self, case, image): + """Create an index searcher.""" + super().__init__() + self.case = case + self.elasticsearch = ElasticsearchDataStore() + self.image = image + self.images = {} + self.postgresql = PostgresqlDataStore() + + if image != 'all': + self.image = os.path.abspath(self.image) + self._get_image_hash() + else: + self._get_case_images() + print(self.images) + + def _get_case_images(self): + """Get all images for the case. + + Returns: + A dictionary of the images in the case. 
+ """ + images = self.postgresql.query(( + 'SELECT image_hash, image_path FROM image_case NATURAL JOIN images ' + 'WHERE case_id = \'{0:s}\'').format(self.case)) + for image_hash, image_path in images: + self.images[image_hash] = image_path + + def _get_image_hash(self): + """Get an image hash from the datastore. + + Returns: + MD5 hash for the image stored in PostgreSQL. + """ + image_hash = self.postgresql.query_single_row( + 'SELECT image_hash FROM images WHERE image_path = \'{0:s}\''.format( + self.image)) + self.images[image_hash[0]] = self.image diff --git a/dfvfs_requirements.txt b/dfvfs_requirements.txt new file mode 100644 index 0000000..dadf6c1 --- /dev/null +++ b/dfvfs_requirements.txt @@ -0,0 +1,26 @@ +pip >= 7.0.0 +PyYAML >= 3.10 +cffi >= 1.9.1 +cryptography >= 2.0.2 +dfdatetime >= 20200809 +dtfabric >= 20170524 +idna >= 2.5 +libbde-python >= 20140531 +libewf-python >= 20131210 +libfsapfs-python >= 20201107 +libfsext-python >= 20200819 +libfshfs-python >= 20201103 +libfsntfs-python >= 20200921 +libfsxfs-python >= 20201114 +libfvde-python >= 20160719 +libfwnt-python >= 20160418 +libluksde-python >= 20200101 +libqcow-python >= 20131204 +libsigscan-python >= 20191221 +libsmdev-python >= 20140529 +libsmraw-python >= 20140612 +libvhdi-python >= 20201014 +libvmdk-python >= 20140421 +libvshadow-python >= 20160109 +libvslvm-python >= 20160109 +pytsk3 >= 20160721 diff --git a/requirements.txt b/requirements.txt index 24eaec6..3831027 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +dfvfs elasticsearch psycopg2-binary pytsk3 diff --git a/setup.py b/setup.py index e0964c5..dc46346 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,8 @@ DFDEWEY_DESCRIPTION = ( requirements = [] with open('requirements.txt','r') as f: requirements = f.read().splitlines() +with open('dfvfs_requirements.txt','r') as f: + requirements.extend(f.read().splitlines()) setup( name='dfDewey', version=dfdewey.__version__, From 58d306b6e0734986ca59246f2622478bbc2f7ca6 Mon Sep 17 00:00:00 2001 From: Jason Solomon Date: Fri, 20 Nov 2020 10:26:12 +1100 Subject: [PATCH 2/6] Refactoring CLI, processing and searching --- dfdewey/dfdcli.py | 2 +- dfdewey/utils/index_searcher.py | 59 ++++++++++++++++++++++++++++++++- requirements.txt | 1 + 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/dfdewey/dfdcli.py b/dfdewey/dfdcli.py index 19ae4b5..e6bcd3d 100755 --- a/dfdewey/dfdcli.py +++ b/dfdewey/dfdcli.py @@ -66,7 +66,7 @@ def main(): else: index_searcher = IndexSearcher(args.case, args.image) if args.search: - pass + index_searcher.search(args.search) elif args.search_list: pass diff --git a/dfdewey/utils/index_searcher.py b/dfdewey/utils/index_searcher.py index e5c06bf..10cccfb 100644 --- a/dfdewey/utils/index_searcher.py +++ b/dfdewey/utils/index_searcher.py @@ -17,12 +17,42 @@ import logging import os +from tabulate import tabulate + from dfdewey.datastore.elastic import ElasticsearchDataStore from dfdewey.datastore.postgresql import PostgresqlDataStore log = logging.getLogger('dfdewey.index_searcher') +class _SearchHit(): + """Search result. + + Attributes: + offset: byte offset of the string within the source image. + filename: filename containing the string if applicable. + data: the responsive string. + """ + + def __init__(self): + self.offset = 0 + self.filename = None + self.data = '' + + def copy_to_dict(self): + """Copies the search hit to a dictionary. + + Returns: + dict[str, object]: search hit attributes. 
+ """ + search_hit_dict = {} + search_hit_dict['Offset'] = self.offset + search_hit_dict['Filename'] = self.filename + search_hit_dict['String'] = self.data + + return search_hit_dict + + class IndexSearcher(): """Index Searcher class.""" @@ -40,7 +70,6 @@ class IndexSearcher(): self._get_image_hash() else: self._get_case_images() - print(self.images) def _get_case_images(self): """Get all images for the case. @@ -64,3 +93,31 @@ class IndexSearcher(): 'SELECT image_hash FROM images WHERE image_path = \'{0:s}\''.format( self.image)) self.images[image_hash[0]] = self.image + + def search(self, query): + """Run a single query. + + Args: + query (str): query to run. + """ + for image_hash, image_path in self.images.items(): + log.info('Searching %s (%s) for "%s"', image_path, image_hash, query) + index = ''.join(('es', image_hash)) + results = self.elasticsearch.search(index, query) + result_count = results['hits']['total']['value'] + time_taken = results['took'] + + results = results['hits']['hits'] + hits = [] + for result in results: + hit = _SearchHit() + offset = str(result['_source']['offset']) + if result['_source']['file_offset']: + offset = '-'.join((offset, result['_source']['file_offset'])) + hit.offset = offset + # TODO (dfjxs): Filenames + hit.data = result['_source']['data'].strip() + hits.append(hit.copy_to_dict()) + output = tabulate(hits, headers='keys', tablefmt='simple') + log.info( + 'Returned %d results in %dms.\n%s', result_count, time_taken, output) diff --git a/requirements.txt b/requirements.txt index 3831027..d3a50a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ elasticsearch psycopg2-binary pytsk3 six +tabulate From ccc9edfc6f74988d7ea8289ced6ecf6f4585313f Mon Sep 17 00:00:00 2001 From: Jason Solomon Date: Fri, 20 Nov 2020 14:25:36 +1100 Subject: [PATCH 3/6] Refactoring CLI, processing and searching --- dfdewey/utils/image_processor.py | 60 +++++++++++++ dfdewey/utils/index_searcher.py | 141 ++++++++++++++++++++++++++++++- 2 files changed, 198 insertions(+), 3 deletions(-) diff --git a/dfdewey/utils/image_processor.py b/dfdewey/utils/image_processor.py index 48718eb..b66305e 100644 --- a/dfdewey/utils/image_processor.py +++ b/dfdewey/utils/image_processor.py @@ -24,6 +24,7 @@ from dfvfs.helpers import volume_scanner from dfvfs.lib import definitions as dfvfs_definitions from dfvfs.lib import errors as dfvfs_errors from dfvfs.resolver import resolver +from dfvfs.volume import tsk_volume_system import pytsk3 from dfdewey.datastore.elastic import ElasticsearchDataStore @@ -71,6 +72,7 @@ class FileEntryScanner(volume_scanner.VolumeScanner): self._datastore = None self._list_only_files = False self._rows = [] + self._volumes = {} def _get_display_path(self, path_spec, path_segments, data_stream_name): """Retrieves a path to display. @@ -116,6 +118,24 @@ class FileEntryScanner(volume_scanner.VolumeScanner): inode = getattr(path_spec, 'inode', None) return inode + def _get_tsk_partition_path_spec(self, path_spec): + """Gets the path spec for the TSK partition. + + Args: + path_spec (dfvfs.PathSpec): path spec of the volume. + + Returns: + TSK partition path_spec or None. 
+ """ + partition_path_spec = None + while path_spec.HasParent(): + type_indicator = path_spec.type_indicator + if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION: + partition_path_spec = path_spec + break + path_spec = path_spec.parent + return partition_path_spec + def _get_volume_location(self, path_spec): """Gets volume location / identifier for the given path spec. @@ -174,6 +194,46 @@ class FileEntryScanner(volume_scanner.VolumeScanner): self._list_file_entry( file_system, sub_file_entry, path_segments, location) + def get_volume_extents(self, image_path): + """Gets the extents of all volumes. + + Args: + image_path (str): path of the source image. + + Returns: + Volume location / identifier, offset, and size for all volumes. + """ + if not self._volumes or self._source_path != image_path: + base_path_specs = self.GetBasePathSpecs(image_path) + + for path_spec in base_path_specs: + partition_path_spec = self._get_tsk_partition_path_spec(path_spec) + if not partition_path_spec: + location = getattr(path_spec, 'location', None) + self._volumes[location] = {'start': 0, 'end': None} + else: + location = getattr(partition_path_spec, 'location', None) + partition_offset = None + partition_size = None + + volume_system = tsk_volume_system.TSKVolumeSystem() + try: + volume_system.Open(partition_path_spec) + volume_identifier = location.replace('/', '') + volume = volume_system.GetVolumeByIdentifier(volume_identifier) + + partition_offset = volume.extents[0].offset + partition_size = volume.extents[0].size + except dfvfs_errors.VolumeSystemError as e: + log.error('Could not process partition: %s', e) + + self._volumes[location] = { + 'start': partition_offset, + 'end': partition_offset + partition_size + } + + return self._volumes + def parse_file_entries(self, base_path_specs, datastore): """Parses file entries in the base path specification. diff --git a/dfdewey/utils/index_searcher.py b/dfdewey/utils/index_searcher.py index 10cccfb..4b9ab1a 100644 --- a/dfdewey/utils/index_searcher.py +++ b/dfdewey/utils/index_searcher.py @@ -17,10 +17,13 @@ import logging import os +from dfvfs.lib import errors as dfvfs_errors +import pytsk3 from tabulate import tabulate from dfdewey.datastore.elastic import ElasticsearchDataStore from dfdewey.datastore.postgresql import PostgresqlDataStore +from dfdewey.utils.image_processor import FileEntryScanner, UnattendedVolumeScannerMediator log = logging.getLogger('dfdewey.index_searcher') @@ -47,7 +50,7 @@ class _SearchHit(): """ search_hit_dict = {} search_hit_dict['Offset'] = self.offset - search_hit_dict['Filename'] = self.filename + search_hit_dict['Filename (inode)'] = self.filename search_hit_dict['String'] = self.data return search_hit_dict @@ -64,6 +67,7 @@ class IndexSearcher(): self.image = image self.images = {} self.postgresql = PostgresqlDataStore() + self.scanner = None if image != 'all': self.image = os.path.abspath(self.image) @@ -83,6 +87,93 @@ class IndexSearcher(): for image_hash, image_path in images: self.images[image_hash] = image_path + def _get_filenames_from_inode(self, inode, location): + """Gets filename(s) from an inode number. 
+ + Args: + inode: Inode number of target file + location: Partition number + + Returns: + Filename of given inode or None + """ + results = self.postgresql.query(( + 'SELECT filename FROM files ' + 'WHERE inum = {0:d} AND part = \'{1:s}\'').format(inode, location)) + filenames = [] + for result in results: + filenames.append(result[0]) + return filenames + + def _get_filename_from_offset(self, image_path, image_hash, offset): + """Gets filename given a byte offset within an image. + + Args: + image_path: source image path. + image_hash: source image hash. + offset: byte offset within the image. + + Returns: + Filename allocated to the given offset, or None. + """ + filenames = [] + + database_name = ''.join(('fs', image_hash)) + self.postgresql.switch_database(db_name=database_name) + + volume_extents = {} + try: + if not self.scanner: + mediator = UnattendedVolumeScannerMediator() + self.scanner = FileEntryScanner(mediator=mediator) + volume_extents = self.scanner.get_volume_extents(image_path) + except dfvfs_errors.ScannerError as e: + log.error('Error scanning for partitions: %s', e) + + hit_location = None + partition_offset = None + for location, extent in volume_extents.items(): + if not extent['end']: + # Image is of a single volume + hit_location = location + partition_offset = extent['start'] + elif extent['start'] <= offset < extent['end']: + hit_location = location + partition_offset = extent['start'] + + if partition_offset is not None: + try: + img = pytsk3.Img_Info(image_path) + filesystem = pytsk3.FS_Info(img, offset=partition_offset) + block_size = filesystem.info.block_size + except TypeError as e: + log.error('Error opening image: %s', e) + + inodes = self._get_inodes( + int((offset - partition_offset) / block_size), hit_location) + + if inodes: + for i in inodes: + inode = i[0] + # Account for resident files + if (i[0] == 0 and + filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT): + mft_record_size_offset = 0x40 + partition_offset + mft_record_size = int.from_bytes( + img.read(mft_record_size_offset, 1), 'little', signed=True) + if mft_record_size < 0: + mft_record_size = 2**(mft_record_size * -1) + else: + mft_record_size = mft_record_size * block_size + inode = self._get_ntfs_resident_inode((offset - partition_offset), + filesystem, mft_record_size) + + inode_filenames = self._get_filenames_from_inode(inode, hit_location) + filename = ' | '.join(inode_filenames) + filenames.append('{0:s} ({1:d})'.format(filename, inode)) + + return filenames + def _get_image_hash(self): """Get an image hash from the datastore. @@ -92,7 +183,49 @@ class IndexSearcher(): image_hash = self.postgresql.query_single_row( 'SELECT image_hash FROM images WHERE image_path = \'{0:s}\''.format( self.image)) - self.images[image_hash[0]] = self.image + if image_hash: + self.images[image_hash[0]] = self.image + + def _get_inodes(self, block, location): + """Gets inode numbers for a block offset. + + Args: + block (int): block offset within the image. + location (str): Partition location / identifier. + + Returns: + Inode number(s) of the given block or None. + """ + inodes = self.postgresql.query( + ('SELECT inum FROM blocks ' + 'WHERE block = {0:d} AND part = \'{1:s}\'').format(block, location)) + return inodes + + def _get_ntfs_resident_inode(self, offset, filesystem, mft_record_size): + """Gets the inode number associated with NTFS $MFT resident data. + + Args: + offset: data offset within volume. + filesystem: pytsk3 FS_INFO object. + mft_record_size: size of each $MFT entry. 
+ + Returns: + inode number of resident data + """ + block_size = filesystem.info.block_size + offset_block = int(offset / block_size) + + inode = filesystem.open_meta(0) + mft_entry = 0 + for attr in inode: + for run in attr: + for block in range(run.len): + if run.addr + block == offset_block: + mft_entry += int( + (offset - (offset_block * block_size)) / mft_record_size) + return mft_entry + mft_entry += int(block_size / mft_record_size) + return 0 def search(self, query): """Run a single query. @@ -115,7 +248,9 @@ class IndexSearcher(): if result['_source']['file_offset']: offset = '-'.join((offset, result['_source']['file_offset'])) hit.offset = offset - # TODO (dfjxs): Filenames + filenames = self._get_filename_from_offset( + image_path, image_hash, result['_source']['offset']) + hit.filename = '\n'.join(filenames) hit.data = result['_source']['data'].strip() hits.append(hit.copy_to_dict()) output = tabulate(hits, headers='keys', tablefmt='simple') From 2f78367a37a8d4ea65c0c53cd9fc0e95a4d265be Mon Sep 17 00:00:00 2001 From: Jason Solomon Date: Fri, 20 Nov 2020 14:29:00 +1100 Subject: [PATCH 4/6] Refactoring CLI, processing and searching --- dfdewey/utils/index_searcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dfdewey/utils/index_searcher.py b/dfdewey/utils/index_searcher.py index 4b9ab1a..843f364 100644 --- a/dfdewey/utils/index_searcher.py +++ b/dfdewey/utils/index_searcher.py @@ -23,7 +23,8 @@ from tabulate import tabulate from dfdewey.datastore.elastic import ElasticsearchDataStore from dfdewey.datastore.postgresql import PostgresqlDataStore -from dfdewey.utils.image_processor import FileEntryScanner, UnattendedVolumeScannerMediator +from dfdewey.utils.image_processor import ( + FileEntryScanner, UnattendedVolumeScannerMediator) log = logging.getLogger('dfdewey.index_searcher') From 15ad0beb02ccfb2e73aa0f68b4510cb9df662330 Mon Sep 17 00:00:00 2001 From: Jason Solomon Date: Fri, 20 Nov 2020 14:56:23 +1100 Subject: [PATCH 5/6] Refactoring CLI, processing and searching --- dfdewey/dfdcli.py | 2 +- dfdewey/utils/image.py | 414 -------------------------------- dfdewey/utils/index_searcher.py | 27 ++- 3 files changed, 27 insertions(+), 416 deletions(-) delete mode 100644 dfdewey/utils/image.py diff --git a/dfdewey/dfdcli.py b/dfdewey/dfdcli.py index e6bcd3d..e65aa0b 100755 --- a/dfdewey/dfdcli.py +++ b/dfdewey/dfdcli.py @@ -68,7 +68,7 @@ def main(): if args.search: index_searcher.search(args.search) elif args.search_list: - pass + index_searcher.list_search(args.search_list) def parse_args(): diff --git a/dfdewey/utils/image.py b/dfdewey/utils/image.py deleted file mode 100644 index 5f630ac..0000000 --- a/dfdewey/utils/image.py +++ /dev/null @@ -1,414 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Image File Access Functions.""" - -import pytsk3 - -from dfdewey.datastore.postgresql import PostgresqlDataStore - - -def initialise_block_db(image_path, image_hash, case): - """Creates a new image database. - - Args: - image_path: Path to image file - image_hash: MD5 of the image - case: Case ID - - Returns: - Boolean value to indicate whether the image has already been processed - """ - img = pytsk3.Img_Info(image_path) - - block_db = PostgresqlDataStore(autocommit=True) - image_exists = check_tracking_database(block_db, image_path, image_hash, case) - - if not image_exists: - db_name = ''.join(('fs', image_hash)) - block_db.execute('CREATE DATABASE {0:s}'.format(db_name)) - - block_db.switch_database(db_name=db_name) - - populate_block_db(img, block_db, batch_size=1500) - - return image_exists - - -def check_tracking_database(tracking_db, image_path, image_hash, case): - """Checks if an image exists in the tracking database. - - Checks if an image exists in the tracking database and adds it if not. - If the image exists, but is not associated with the given case ID, will add - the association. - - Args: - tracking_db: PostgreSQL database - image_path: Path to image file - image_hash: MD5 of the image - case: Case ID - - Returns: - Boolean value to indicate the existence of the image - """ - tables_exist = tracking_db.table_exists('images') - - image_exists = False - if not tables_exist: - tracking_db.execute( - 'CREATE TABLE images (image_path TEXT, image_hash TEXT PRIMARY KEY)') - - tracking_db.execute( - """ - CREATE TABLE image_case ( - case_id TEXT, image_hash TEXT REFERENCES images(image_hash), - PRIMARY KEY (case_id, image_hash))""") - else: - image_exists = tracking_db.value_exists('images', 'image_hash', image_hash) - - image_case_exists = False - if image_exists: - image_case = tracking_db.query_single_row( - """ - SELECT 1 from image_case - WHERE image_hash = '{0:s}' AND case_id = '{1:s}'""".format( - image_hash, case)) - if image_case: - image_case_exists = True - - if not image_exists: - tracking_db.execute( - """ - INSERT INTO images (image_path, image_hash) - VALUES ('{0:s}', '{1:s}')""".format(image_path, image_hash)) - if not image_case_exists: - tracking_db.execute( - """ - INSERT INTO image_case (case_id, image_hash) - VALUES ('{0:s}', '{1:s}')""".format(case, image_hash)) - - return image_exists - - -def populate_block_db(img, block_db, batch_size=1500): - """Creates a new image block database. - - Args: - img: pytsk image info object - block_db: PostgreSQL database - batch_size: Number of rows to insert at a time - """ - print('Image database does not already exist. 
Parsing image filesystem(s)...') - block_db.execute( - 'CREATE TABLE blocks (block INTEGER, inum INTEGER, part INTEGER)') - block_db.execute( - 'CREATE TABLE files (inum INTEGER, filename TEXT, part INTEGER)') - - has_partition_table = False - try: - volume = pytsk3.Volume_Info(img) - if volume: - print('Image has a partition table...') - has_partition_table = True - rows = [] - for part in volume: - print( - 'Parsing partition {0:d}: {1:s}'.format( - part.addr, part.desc.decode('utf-8'))) - if part.flags != pytsk3.TSK_VS_PART_FLAG_ALLOC: - continue - filesystem = pytsk3.FS_Info( - img, offset=part.start * volume.info.block_size) - for inode in range(filesystem.info.first_inum, - filesystem.info.last_inum + 1): - file = filesystem.open_meta(inode) - if file.info.meta.nlink > 0: - for attr in file: - for run in attr: - for block in range(run.len): - rows.append(( - run.addr + block, - inode, - part.addr, - )) - if len(rows) >= batch_size: - block_db.bulk_insert('blocks (block, inum, part)', rows) - rows = [] - if rows: - block_db.bulk_insert('blocks (block, inum, part)', rows) - - # File names - directory = filesystem.open_dir(path='/') - list_directory(block_db, directory, part=part.addr, batch_size=batch_size) - except IOError: - pass - - if not has_partition_table: - filesystem = pytsk3.FS_Info(img) - rows = [] - for inode in range(filesystem.info.first_inum, - filesystem.info.last_inum + 1): - try: - file = filesystem.open_meta(inode) - if file.info.meta.nlink > 0: - for attr in file: - for run in attr: - for block in range(run.len): - rows.append(( - run.addr + block, - inode, - )) - if len(rows) >= batch_size: - block_db.bulk_insert('blocks (block, inum)', rows) - rows = [] - if rows: - block_db.bulk_insert('blocks (block, inum)', rows) - except OSError: - continue - - # File names - directory = filesystem.open_dir(path='/') - list_directory(block_db, directory, batch_size=batch_size) - - block_db.execute('CREATE INDEX blocks_index ON blocks (block, part);') - block_db.execute('CREATE INDEX files_index ON files (inum, part);') - - -def list_directory( - block_db, directory, part=None, stack=None, rows=None, batch_size=1500): - """Recursive function to create a filesystem listing. 
- - Args: - block_db: PostgreSQL database - directory: pytsk directory object - part: Partition number - stack: Inode stack to control recursive filesystem parsing - rows: Array for batch database inserts - batch_size: Number of rows to insert at a time - - Returns: - Current rows array for recursion - """ - if not stack: - stack = [] - if not rows: - rows = [] - stack.append(directory.info.fs_file.meta.addr) - - for directory_entry in directory: - # TODO(js): Refactor - if (not hasattr(directory_entry, 'info') or - not hasattr(directory_entry.info, 'name') or - not hasattr(directory_entry.info.name, 'name') or - directory_entry.info.meta is None or - directory_entry.info.name.name in [b'.', b'..'] or - directory_entry.info.name.flags == pytsk3.TSK_FS_NAME_FLAG_UNALLOC): - continue - try: - name = directory_entry.info.name.name.decode('utf-8') - except UnicodeDecodeError: - print('Unable to decode: {}'.format(directory_entry.info.name.name)) - continue - if part: - rows.append(( - directory_entry.info.meta.addr, - name.replace('\'', '\'\''), - part, - )) - if len(rows) >= batch_size: - block_db.bulk_insert('files (inum, filename, part)', rows) - rows = [] - else: - rows.append(( - directory_entry.info.meta.addr, - name.replace('\'', '\'\''), - )) - if len(rows) >= batch_size: - block_db.bulk_insert('files (inum, filename)', rows) - rows = [] - - try: - sub_directory = directory_entry.as_directory() - inode = directory_entry.info.meta.addr - - if inode not in stack: - rows = list_directory( - block_db, sub_directory, part=part, stack=stack, rows=rows, - batch_size=batch_size) - - except IOError: - pass - - stack.pop(-1) - if not stack: - if part: - block_db.bulk_insert('files (inum, filename, part)', rows) - else: - block_db.bulk_insert('files (inum, filename)', rows) - - return rows - - -def get_filename_from_offset(image_path, image_hash, offset): - """Gets filename given a byte offset within an image. 
- - Args: - image_path: Source image path - image_hash: Source image hash - offset: Byte offset within the image - - Returns: - Filename allocated to the given offset - """ - img = pytsk3.Img_Info(image_path) - - db_name = ''.join(('fs', image_hash)) - block_db = PostgresqlDataStore(db_name=db_name) - - device_block_size = None - partition = None - partition_offset = None - unalloc_part = False - try: - volume = pytsk3.Volume_Info(img) - device_block_size = volume.info.block_size - sector_offset = offset / device_block_size - for part in volume: - if part.start <= sector_offset < part.start + part.len: - if part.flags != pytsk3.TSK_VS_PART_FLAG_ALLOC: - unalloc_part = True - partition = part.addr - partition_offset = part.start - except IOError: - pass - - inums = None - if not unalloc_part: - try: - if not partition_offset: - filesystem = pytsk3.FS_Info(img) - else: - offset -= partition_offset * device_block_size - filesystem = pytsk3.FS_Info( - img, offset=partition_offset * device_block_size) - except TypeError as e: - print(e) - block_size = filesystem.info.block_size - - inums = get_inums(block_db, offset / block_size, part=partition) - - filenames = [] - if inums: - for i in inums: - real_inum = i[0] - if i[0] == 0 and filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT: - mft_record_size_offset = 0x40 - if partition_offset: - mft_record_size_offset = \ - mft_record_size_offset + (partition_offset * device_block_size) - mft_record_size = int.from_bytes( - img.read(mft_record_size_offset, 1), 'little', signed=True) - if mft_record_size < 0: - mft_record_size = 2**(mft_record_size * -1) - else: - mft_record_size = mft_record_size * block_size - real_inum = get_resident_inum(offset, filesystem, mft_record_size) - filename = get_filename(block_db, real_inum, part=partition) - if filename and not filenames: - filenames.append('{0:s} ({1:d})'.format(filename, real_inum)) - else: - if '{0:s} ({1:d})'.format(filename, real_inum) not in filenames: - filenames.append('{0:s} ({1:d})'.format(filename, real_inum)) - - if not filenames: - return 'No filenames found' - else: - return ' | '.join(filenames) - - -def get_inums(block_db, block, part=None): - """Gets inode number from block offset. - - Args: - block_db: PostgreSQL database - block: Block offset within the image - part: Partition number - - Returns: - Inode number(s) of the given block or None - """ - if part: - inums = block_db.query( - 'SELECT inum FROM blocks WHERE block = {0:d} AND part = {1:d}'.format( - int(block), part)) - else: - inums = block_db.query( - 'SELECT inum FROM blocks WHERE block = {0:d}'.format(int(block))) - - return inums - - -def get_resident_inum(offset, filesystem, mft_record_size): - """Gets the inode number associated with NTFS $MFT resident data. - - Args: - offset: Data offset within volume - filesystem: pytsk3 FS_INFO object - mft_record_size: Size of an $MFT entry - - Returns: - inode number of resident data - """ - block_size = filesystem.info.block_size - offset_block = int(offset / block_size) - - inode = filesystem.open_meta(0) - mft_entry = 0 - for attr in inode: - for run in attr: - for block in range(run.len): - if run.addr + block == offset_block: - mft_entry += int( - (offset - (offset_block * block_size)) / mft_record_size) - return mft_entry - else: - mft_entry += int(block_size / mft_record_size) - return 0 - - -def get_filename(block_db, inum, part=None): - """Gets filename given an inode number. 
- - Args: - block_db: PostgreSQL database - inum: Inode number of target file - part: Partition number - - Returns: - Filename of given inode or None - """ - if part: - filenames = block_db.query( - 'SELECT filename FROM files WHERE inum = {0:d} AND part = {1:d}'.format( - inum, part)) - else: - filenames = block_db.query( - 'SELECT filename FROM files WHERE inum = {0:d}'.format(inum)) - - if filenames: - filename = filenames[0][0] - else: - filename = 'No filenames found' - - return filename diff --git a/dfdewey/utils/index_searcher.py b/dfdewey/utils/index_searcher.py index 843f364..f73428e 100644 --- a/dfdewey/utils/index_searcher.py +++ b/dfdewey/utils/index_searcher.py @@ -228,6 +228,30 @@ class IndexSearcher(): mft_entry += int(block_size / mft_record_size) return 0 + def list_search(self, query_list): + """Query a list of search terms. + + Args: + query_list (str): path to a text file containing multiple search terms. + """ + for image_hash, image_path in self.images.items(): + index = ''.join(('es', image_hash)) + with open(query_list, 'r') as search_terms: + table_data = [] + for term in search_terms: + term = ''.join(('"', term.strip(), '"')) + results = self.elasticsearch.search(index, term) + hit_count = results['hits']['total']['value'] + if hit_count > 0: + table_data.append({'Search term': term, 'Hits': hit_count}) + if table_data: + output = tabulate(table_data, headers='keys', tablefmt='simple') + else: + output = 'No results.' + log.info( + 'Searched %s (%s) for terms in %s\n\n%s\n', image_path, image_hash, + query_list, output) + def search(self, query): """Run a single query. @@ -256,4 +280,5 @@ class IndexSearcher(): hits.append(hit.copy_to_dict()) output = tabulate(hits, headers='keys', tablefmt='simple') log.info( - 'Returned %d results in %dms.\n%s', result_count, time_taken, output) + 'Returned %d results in %dms.\n\n%s\n', result_count, time_taken, + output) From 6dba7383e504a179455e325e74cd7a2133fb9e6a Mon Sep 17 00:00:00 2001 From: Jason Solomon Date: Fri, 20 Nov 2020 15:01:48 +1100 Subject: [PATCH 6/6] Refactoring CLI, processing and searching --- docs/usage.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index ff45a4f..8adc5b4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,14 +1,14 @@ # Using dfDewey ```shell -usage: dfdcli.py [-h] -c CASE [-i IMAGE] [--no_base64] [--no_gzip] [--no_zip] - [-s SEARCH] [--search_list SEARCH_LIST] +usage: dfdcli.py [-h] [--no_base64] [--no_gzip] [--no_zip] [-s SEARCH] [--search_list SEARCH_LIST] case [image] + +positional arguments: + case case ID + image image file (default: 'all') optional arguments: -h, --help show this help message and exit - -c CASE, --case CASE case ID - -i IMAGE, --image IMAGE - image file --no_base64 don't decode base64 --no_gzip don't decompress gzip --no_zip don't decompress zip @@ -16,6 +16,7 @@ optional arguments: search query --search_list SEARCH_LIST file with search queries + ``` ## Docker @@ -59,7 +60,7 @@ docker run --network=host -v ~/images/:/mnt/images dfdewey -h To process an image in dfDewey, you need to supply a `CASE` and `IMAGE`. ```shell -dfdcli.py -c testcase -i /path/to/image.dd +dfdcli.py testcase /path/to/image.dd ``` dfDewey will have bulk_extractor decode base64 data, and decompress gzip / zip @@ -72,7 +73,7 @@ To search the index for a single image, you need to supply a `CASE`, `IMAGE`, and `SEARCH`. 
```shell -dfdcli.py -c testcase -i /path/to/image.dd -s foo +dfdcli.py testcase /path/to/image.dd -s 'foo' ``` If an `IMAGE` is not provided, dfDewey will search all images in the given case. @@ -82,5 +83,5 @@ a text file one per line. In this case, only the number of results for each term is returned. ```shell -dfdcli.py -c testcase -i /path/to/image.dd --search_list search_terms.txt +dfdcli.py testcase /path/to/image.dd --search_list search_terms.txt ```