From 15ad0beb02ccfb2e73aa0f68b4510cb9df662330 Mon Sep 17 00:00:00 2001
From: Jason Solomon
Date: Fri, 20 Nov 2020 14:56:23 +1100
Subject: [PATCH] Refactoring CLI, processing and searching

---
 dfdewey/dfdcli.py               |   2 +-
 dfdewey/utils/image.py          | 414 --------------------------------
 dfdewey/utils/index_searcher.py |  27 ++-
 3 files changed, 27 insertions(+), 416 deletions(-)
 delete mode 100644 dfdewey/utils/image.py

diff --git a/dfdewey/dfdcli.py b/dfdewey/dfdcli.py
index e6bcd3d..e65aa0b 100755
--- a/dfdewey/dfdcli.py
+++ b/dfdewey/dfdcli.py
@@ -68,7 +68,7 @@ def main():
   if args.search:
     index_searcher.search(args.search)
   elif args.search_list:
-    pass
+    index_searcher.list_search(args.search_list)
 
 
 def parse_args():
diff --git a/dfdewey/utils/image.py b/dfdewey/utils/image.py
deleted file mode 100644
index 5f630ac..0000000
--- a/dfdewey/utils/image.py
+++ /dev/null
@@ -1,414 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Image File Access Functions."""
-
-import pytsk3
-
-from dfdewey.datastore.postgresql import PostgresqlDataStore
-
-
-def initialise_block_db(image_path, image_hash, case):
-  """Creates a new image database.
-
-  Args:
-    image_path: Path to image file
-    image_hash: MD5 of the image
-    case: Case ID
-
-  Returns:
-    Boolean value to indicate whether the image has already been processed
-  """
-  img = pytsk3.Img_Info(image_path)
-
-  block_db = PostgresqlDataStore(autocommit=True)
-  image_exists = check_tracking_database(block_db, image_path, image_hash, case)
-
-  if not image_exists:
-    db_name = ''.join(('fs', image_hash))
-    block_db.execute('CREATE DATABASE {0:s}'.format(db_name))
-
-    block_db.switch_database(db_name=db_name)
-
-    populate_block_db(img, block_db, batch_size=1500)
-
-  return image_exists
-
-
-def check_tracking_database(tracking_db, image_path, image_hash, case):
-  """Checks if an image exists in the tracking database.
-
-  Checks if an image exists in the tracking database and adds it if not.
-  If the image exists, but is not associated with the given case ID, will add
-  the association.
-
-  Args:
-    tracking_db: PostgreSQL database
-    image_path: Path to image file
-    image_hash: MD5 of the image
-    case: Case ID
-
-  Returns:
-    Boolean value to indicate the existence of the image
-  """
-  tables_exist = tracking_db.table_exists('images')
-
-  image_exists = False
-  if not tables_exist:
-    tracking_db.execute(
-        'CREATE TABLE images (image_path TEXT, image_hash TEXT PRIMARY KEY)')
-
-    tracking_db.execute(
-        """
-        CREATE TABLE image_case (
-          case_id TEXT, image_hash TEXT REFERENCES images(image_hash),
-          PRIMARY KEY (case_id, image_hash))""")
-  else:
-    image_exists = tracking_db.value_exists('images', 'image_hash', image_hash)
-
-  image_case_exists = False
-  if image_exists:
-    image_case = tracking_db.query_single_row(
-        """
-        SELECT 1 from image_case
-        WHERE image_hash = '{0:s}' AND case_id = '{1:s}'""".format(
-            image_hash, case))
-    if image_case:
-      image_case_exists = True
-
-  if not image_exists:
-    tracking_db.execute(
-        """
-        INSERT INTO images (image_path, image_hash)
-        VALUES ('{0:s}', '{1:s}')""".format(image_path, image_hash))
-  if not image_case_exists:
-    tracking_db.execute(
-        """
-        INSERT INTO image_case (case_id, image_hash)
-        VALUES ('{0:s}', '{1:s}')""".format(case, image_hash))
-
-  return image_exists
-
-
-def populate_block_db(img, block_db, batch_size=1500):
-  """Creates a new image block database.
-
-  Args:
-    img: pytsk image info object
-    block_db: PostgreSQL database
-    batch_size: Number of rows to insert at a time
-  """
-  print('Image database does not already exist. Parsing image filesystem(s)...')
-  block_db.execute(
-      'CREATE TABLE blocks (block INTEGER, inum INTEGER, part INTEGER)')
-  block_db.execute(
-      'CREATE TABLE files (inum INTEGER, filename TEXT, part INTEGER)')
-
-  has_partition_table = False
-  try:
-    volume = pytsk3.Volume_Info(img)
-    if volume:
-      print('Image has a partition table...')
-      has_partition_table = True
-    rows = []
-    for part in volume:
-      print(
-          'Parsing partition {0:d}: {1:s}'.format(
-              part.addr, part.desc.decode('utf-8')))
-      if part.flags != pytsk3.TSK_VS_PART_FLAG_ALLOC:
-        continue
-      filesystem = pytsk3.FS_Info(
-          img, offset=part.start * volume.info.block_size)
-      for inode in range(filesystem.info.first_inum,
-                         filesystem.info.last_inum + 1):
-        file = filesystem.open_meta(inode)
-        if file.info.meta.nlink > 0:
-          for attr in file:
-            for run in attr:
-              for block in range(run.len):
-                rows.append((
-                    run.addr + block,
-                    inode,
-                    part.addr,
-                ))
-                if len(rows) >= batch_size:
-                  block_db.bulk_insert('blocks (block, inum, part)', rows)
-                  rows = []
-      if rows:
-        block_db.bulk_insert('blocks (block, inum, part)', rows)
-
-      # File names
-      directory = filesystem.open_dir(path='/')
-      list_directory(block_db, directory, part=part.addr, batch_size=batch_size)
-  except IOError:
-    pass
-
-  if not has_partition_table:
-    filesystem = pytsk3.FS_Info(img)
-    rows = []
-    for inode in range(filesystem.info.first_inum,
-                       filesystem.info.last_inum + 1):
-      try:
-        file = filesystem.open_meta(inode)
-        if file.info.meta.nlink > 0:
-          for attr in file:
-            for run in attr:
-              for block in range(run.len):
-                rows.append((
-                    run.addr + block,
-                    inode,
-                ))
-                if len(rows) >= batch_size:
-                  block_db.bulk_insert('blocks (block, inum)', rows)
-                  rows = []
-        if rows:
-          block_db.bulk_insert('blocks (block, inum)', rows)
-      except OSError:
-        continue
-
-    # File names
-    directory = filesystem.open_dir(path='/')
-    list_directory(block_db, directory, batch_size=batch_size)
-
-  block_db.execute('CREATE INDEX blocks_index ON blocks (block, part);')
-  block_db.execute('CREATE INDEX files_index ON files (inum, part);')
-
-
-def list_directory(
-    block_db, directory, part=None, stack=None, rows=None, batch_size=1500):
-  """Recursive function to create a filesystem listing.
-
-  Args:
-    block_db: PostgreSQL database
-    directory: pytsk directory object
-    part: Partition number
-    stack: Inode stack to control recursive filesystem parsing
-    rows: Array for batch database inserts
-    batch_size: Number of rows to insert at a time
-
-  Returns:
-    Current rows array for recursion
-  """
-  if not stack:
-    stack = []
-  if not rows:
-    rows = []
-  stack.append(directory.info.fs_file.meta.addr)
-
-  for directory_entry in directory:
-    # TODO(js): Refactor
-    if (not hasattr(directory_entry, 'info') or
-        not hasattr(directory_entry.info, 'name') or
-        not hasattr(directory_entry.info.name, 'name') or
-        directory_entry.info.meta is None or
-        directory_entry.info.name.name in [b'.', b'..'] or
-        directory_entry.info.name.flags == pytsk3.TSK_FS_NAME_FLAG_UNALLOC):
-      continue
-    try:
-      name = directory_entry.info.name.name.decode('utf-8')
-    except UnicodeDecodeError:
-      print('Unable to decode: {}'.format(directory_entry.info.name.name))
-      continue
-    if part:
-      rows.append((
-          directory_entry.info.meta.addr,
-          name.replace('\'', '\'\''),
-          part,
-      ))
-      if len(rows) >= batch_size:
-        block_db.bulk_insert('files (inum, filename, part)', rows)
-        rows = []
-    else:
-      rows.append((
-          directory_entry.info.meta.addr,
-          name.replace('\'', '\'\''),
-      ))
-      if len(rows) >= batch_size:
-        block_db.bulk_insert('files (inum, filename)', rows)
-        rows = []
-
-    try:
-      sub_directory = directory_entry.as_directory()
-      inode = directory_entry.info.meta.addr
-
-      if inode not in stack:
-        rows = list_directory(
-            block_db, sub_directory, part=part, stack=stack, rows=rows,
-            batch_size=batch_size)
-
-    except IOError:
-      pass
-
-  stack.pop(-1)
-  if not stack:
-    if part:
-      block_db.bulk_insert('files (inum, filename, part)', rows)
-    else:
-      block_db.bulk_insert('files (inum, filename)', rows)
-
-  return rows
-
-
-def get_filename_from_offset(image_path, image_hash, offset):
-  """Gets filename given a byte offset within an image.
-
-  Args:
-    image_path: Source image path
-    image_hash: Source image hash
-    offset: Byte offset within the image
-
-  Returns:
-    Filename allocated to the given offset
-  """
-  img = pytsk3.Img_Info(image_path)
-
-  db_name = ''.join(('fs', image_hash))
-  block_db = PostgresqlDataStore(db_name=db_name)
-
-  device_block_size = None
-  partition = None
-  partition_offset = None
-  unalloc_part = False
-  try:
-    volume = pytsk3.Volume_Info(img)
-    device_block_size = volume.info.block_size
-    sector_offset = offset / device_block_size
-    for part in volume:
-      if part.start <= sector_offset < part.start + part.len:
-        if part.flags != pytsk3.TSK_VS_PART_FLAG_ALLOC:
-          unalloc_part = True
-        partition = part.addr
-        partition_offset = part.start
-  except IOError:
-    pass
-
-  inums = None
-  if not unalloc_part:
-    try:
-      if not partition_offset:
-        filesystem = pytsk3.FS_Info(img)
-      else:
-        offset -= partition_offset * device_block_size
-        filesystem = pytsk3.FS_Info(
-            img, offset=partition_offset * device_block_size)
-    except TypeError as e:
-      print(e)
-    block_size = filesystem.info.block_size
-
-    inums = get_inums(block_db, offset / block_size, part=partition)
-
-  filenames = []
-  if inums:
-    for i in inums:
-      real_inum = i[0]
-      if i[0] == 0 and filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT:
-        mft_record_size_offset = 0x40
-        if partition_offset:
-          mft_record_size_offset = \
-              mft_record_size_offset + (partition_offset * device_block_size)
-        mft_record_size = int.from_bytes(
-            img.read(mft_record_size_offset, 1), 'little', signed=True)
-        if mft_record_size < 0:
-          mft_record_size = 2**(mft_record_size * -1)
-        else:
-          mft_record_size = mft_record_size * block_size
-        real_inum = get_resident_inum(offset, filesystem, mft_record_size)
-      filename = get_filename(block_db, real_inum, part=partition)
-      if filename and not filenames:
-        filenames.append('{0:s} ({1:d})'.format(filename, real_inum))
-      else:
-        if '{0:s} ({1:d})'.format(filename, real_inum) not in filenames:
-          filenames.append('{0:s} ({1:d})'.format(filename, real_inum))
-
-  if not filenames:
-    return 'No filenames found'
-  else:
-    return ' | '.join(filenames)
-
-
-def get_inums(block_db, block, part=None):
-  """Gets inode number from block offset.
-
-  Args:
-    block_db: PostgreSQL database
-    block: Block offset within the image
-    part: Partition number
-
-  Returns:
-    Inode number(s) of the given block or None
-  """
-  if part:
-    inums = block_db.query(
-        'SELECT inum FROM blocks WHERE block = {0:d} AND part = {1:d}'.format(
-            int(block), part))
-  else:
-    inums = block_db.query(
-        'SELECT inum FROM blocks WHERE block = {0:d}'.format(int(block)))
-
-  return inums
-
-
-def get_resident_inum(offset, filesystem, mft_record_size):
-  """Gets the inode number associated with NTFS $MFT resident data.
-
-  Args:
-    offset: Data offset within volume
-    filesystem: pytsk3 FS_INFO object
-    mft_record_size: Size of an $MFT entry
-
-  Returns:
-    inode number of resident data
-  """
-  block_size = filesystem.info.block_size
-  offset_block = int(offset / block_size)
-
-  inode = filesystem.open_meta(0)
-  mft_entry = 0
-  for attr in inode:
-    for run in attr:
-      for block in range(run.len):
-        if run.addr + block == offset_block:
-          mft_entry += int(
-              (offset - (offset_block * block_size)) / mft_record_size)
-          return mft_entry
-        else:
-          mft_entry += int(block_size / mft_record_size)
-  return 0
-
-
-def get_filename(block_db, inum, part=None):
-  """Gets filename given an inode number.
-
-  Args:
-    block_db: PostgreSQL database
-    inum: Inode number of target file
-    part: Partition number
-
-  Returns:
-    Filename of given inode or None
-  """
-  if part:
-    filenames = block_db.query(
-        'SELECT filename FROM files WHERE inum = {0:d} AND part = {1:d}'.format(
-            inum, part))
-  else:
-    filenames = block_db.query(
-        'SELECT filename FROM files WHERE inum = {0:d}'.format(inum))
-
-  if filenames:
-    filename = filenames[0][0]
-  else:
-    filename = 'No filenames found'
-
-  return filename
diff --git a/dfdewey/utils/index_searcher.py b/dfdewey/utils/index_searcher.py
index 843f364..f73428e 100644
--- a/dfdewey/utils/index_searcher.py
+++ b/dfdewey/utils/index_searcher.py
@@ -228,6 +228,30 @@
       mft_entry += int(block_size / mft_record_size)
     return 0
 
+  def list_search(self, query_list):
+    """Query a list of search terms.
+
+    Args:
+      query_list (str): path to a text file containing multiple search terms.
+    """
+    for image_hash, image_path in self.images.items():
+      index = ''.join(('es', image_hash))
+      with open(query_list, 'r') as search_terms:
+        table_data = []
+        for term in search_terms:
+          term = ''.join(('"', term.strip(), '"'))
+          results = self.elasticsearch.search(index, term)
+          hit_count = results['hits']['total']['value']
+          if hit_count > 0:
+            table_data.append({'Search term': term, 'Hits': hit_count})
+        if table_data:
+          output = tabulate(table_data, headers='keys', tablefmt='simple')
+        else:
+          output = 'No results.'
+        log.info(
+            'Searched %s (%s) for terms in %s\n\n%s\n', image_path, image_hash,
+            query_list, output)
+
   def search(self, query):
     """Run a single query.
 
@@ -256,4 +280,5 @@
         hits.append(hit.copy_to_dict())
     output = tabulate(hits, headers='keys', tablefmt='simple')
     log.info(
-        'Returned %d results in %dms.\n%s', result_count, time_taken, output)
+        'Returned %d results in %dms.\n\n%s\n', result_count, time_taken,
+        output)
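
Usage note: a minimal sketch of exercising the new list-search path. The input
file format follows list_search() above: one term per line, each stripped and
wrapped in double quotes before the es<image_hash> index is queried. The
--search_list flag name and the positional case/image arguments are assumptions
inferred from args.search_list in dfdcli.py; parse_args() is not shown in this
patch, so the exact invocation may differ.

    # terms.txt: one search term per line (illustrative terms).
    $ cat terms.txt
    password
    id_rsa

    # Search every image attached to the case for each term in the file.
    # For each image, dfDewey logs a table of the terms that returned at
    # least one hit, or 'No results.' if none did.
    $ dfdewey testcase /path/to/image.dd --search_list terms.txt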