From 7aadd41ee20f26b0c0cb75107d0b4172c52baeac Mon Sep 17 00:00:00 2001 From: Jason <52063018+dfjxs@users.noreply.github.com> Date: Fri, 3 Jun 2022 15:35:43 +1000 Subject: [PATCH] Add image reparse and deletion functions (#31) * Update readme for bulk_extractor v2.0.0 * Update docker image to Ubuntu 20.04 * Parse filesystem before string extraction * Refactor postgres datastore code * Add reparse option * Add option to delete image data * Update usage * Update version --- README.md | 9 +- dfdewey/__init__.py | 2 +- dfdewey/datastore/postgresql.py | 216 ++++++++++++++++++++++++-- dfdewey/datastore/postgresql_test.py | 127 ++++++++++++++- dfdewey/dfdcli.py | 10 +- dfdewey/utils/image_processor.py | 170 ++++++++++++-------- dfdewey/utils/image_processor_test.py | 130 ++++++++-------- dfdewey/utils/index_searcher.py | 73 ++------- dfdewey/utils/index_searcher_test.py | 23 +-- docker/Dockerfile | 4 +- docs/usage.md | 14 +- 11 files changed, 541 insertions(+), 237 deletions(-) diff --git a/README.md b/README.md index 8a8b844..fff61a9 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,8 @@ dfDewey is a digital forensics string extraction, indexing, and searching tool. ## Requirements ### bulk_extractor dfDewey currently requires bulk_extractor for string extraction. -bulk_extractor can be downloaded and built from source here: -https://github.com/simsong/bulk_extractor -bulk_extractor can also be installed from the GIFT PPA. +bulk_extractor can be installed from the GIFT PPA. ```shell sudo add-apt-repository ppa:gift/stable @@ -19,6 +17,11 @@ sudo apt update sudo apt install -y bulk-extractor ``` +bulk_extractor can also be downloaded and built from source here: +https://github.com/simsong/bulk_extractor + +Note: bulk_extractor v1.6.0 is recommended (v2.0.0 is not yet supported). + ### dfVFS [dfVFS](https://github.com/log2timeline/dfvfs) is required for image parsing. It can be installed from the GIFT PPA. 
diff --git a/dfdewey/__init__.py b/dfdewey/__init__.py index 298299c..24c9dbb 100644 --- a/dfdewey/__init__.py +++ b/dfdewey/__init__.py @@ -17,4 +17,4 @@ dfDewey is a digital forensics string extraction, indexing, and searching tool. """ -__version__ = '20211220' +__version__ = '20220603' diff --git a/dfdewey/datastore/postgresql.py b/dfdewey/datastore/postgresql.py index f6dcdc6..6ab9313 100644 --- a/dfdewey/datastore/postgresql.py +++ b/dfdewey/datastore/postgresql.py @@ -44,19 +44,7 @@ class PostgresqlDataStore(): except AttributeError: pass - def bulk_insert(self, table_spec, rows): - """Execute a bulk insert into a table. - - Args: - table_spec: String in the form 'table_name (col1, col2, ..., coln)' - rows: Array of value tuples to be inserted - """ - extras.execute_values( - self.cursor, - 'INSERT INTO {0:s} VALUES %s ON CONFLICT DO NOTHING'.format(table_spec), - rows) - - def execute(self, command): + def _execute(self, command): """Execute a command in the PostgreSQL database. Args: @@ -64,7 +52,7 @@ class PostgresqlDataStore(): """ self.cursor.execute(command) - def query(self, query): + def _query(self, query): """Query the database. Args: @@ -77,7 +65,7 @@ class PostgresqlDataStore(): return self.cursor.fetchall() - def query_single_row(self, query): + def _query_single_row(self, query): """Query the database for a single row. Args: @@ -90,6 +78,191 @@ class PostgresqlDataStore(): return self.cursor.fetchone() + def bulk_insert(self, table_spec, rows): + """Execute a bulk insert into a table. + + Args: + table_spec: String in the form 'table_name (col1, col2, ..., coln)' + rows: Array of value tuples to be inserted + """ + extras.execute_values( + self.cursor, + 'INSERT INTO {0:s} VALUES %s ON CONFLICT DO NOTHING'.format(table_spec), + rows) + + def create_database(self, db_name): + """Create a database for the image. 
+ + Args: + db_name: Database name + """ + self._execute('CREATE DATABASE {0:s}'.format(db_name)) + + def create_filesystem_database(self): + """Create a filesystem database for the image.""" + self._execute(( + 'CREATE TABLE blocks (block INTEGER, inum INTEGER, part TEXT, ' + 'PRIMARY KEY (block, inum, part))')) + self._execute(( + 'CREATE TABLE files (inum INTEGER, filename TEXT, part TEXT, ' + 'PRIMARY KEY (inum, filename, part))')) + + def delete_filesystem_database(self, db_name): + """Delete the filesystem database for the image. + + Args: + db_name: The name of the database to drop + """ + self._execute('DROP DATABASE {0:s}'.format(db_name)) + + def delete_image(self, image_id): + """Delete an image from the database. + + Args: + image_id: Image identifier + """ + self._execute( + 'DELETE FROM images WHERE image_id = \'{0:s}\''.format(image_id)) + + def get_case_images(self, case): + """Get all images for the case. + + Args: + case: Case name + + Returns: + A dictionary of the images in the case. + """ + images = {} + results = self._query(( + 'SELECT image_hash, image_path FROM image_case NATURAL JOIN images ' + 'WHERE case_id = \'{0:s}\'').format(case)) + for image_hash, image_path in results: + images[image_hash] = image_path + return images + + def get_filenames_from_inode(self, inode, location): + """Gets filename(s) from an inode number. + + Args: + inode: Inode number of target file + location: Partition number + + Returns: + Filename(s) of given inode or None + """ + results = self._query(( + 'SELECT filename FROM files ' + 'WHERE inum = {0:d} AND part = \'{1:s}\'').format(inode, location)) + filenames = [] + for result in results: + filenames.append(result[0]) + return filenames + + def get_image_cases(self, image_id): + """Get a list of cases the image is linked to. + + Args: + image_id: Image identifier + + Returns: + List of cases or None. 
+ """ + cases = self._query( + 'SELECT case_id FROM image_case WHERE image_id = \'{0:s}\''.format( + image_id)) + for c in range(len(cases)): + cases[c] = cases[c][0] + return cases + + def get_image_hash(self, image_id): + """Get an image hash from the database. + + Args: + image_id: Image identifier + + Returns: + Hash for the image stored in PostgreSQL or None. + """ + image_hash = self._query_single_row( + 'SELECT image_hash FROM images WHERE image_id = \'{0:s}\''.format( + image_id)) + if image_hash: + return image_hash[0] + else: + return None + + def get_inodes(self, block, location): + """Gets inode numbers for a block offset. + + Args: + block (int): block offset within the image. + location (str): Partition location / identifier. + + Returns: + Inode number(s) of the given block or None. + """ + inodes = self._query( + ('SELECT inum FROM blocks ' + 'WHERE block = {0:d} AND part = \'{1:s}\'').format(block, location)) + for i in range(len(inodes)): + inodes[i] = inodes[i][0] + return inodes + + def initialise_database(self): + """Initialise the image database.""" + self._execute(( + 'CREATE TABLE images (image_id TEXT PRIMARY KEY, image_path TEXT, ' + 'image_hash TEXT)')) + + self._execute(( + 'CREATE TABLE image_case (' + 'case_id TEXT, image_id TEXT REFERENCES images(image_id), ' + 'PRIMARY KEY (case_id, image_id))')) + + def insert_image(self, image_id, image_path, image_hash): + """Add an image to the database. + + Args: + image_id: Image identifier + image_path: Path to the image file + image_hash: Hash of the image + """ + self._execute(( + 'INSERT INTO images (image_id, image_path, image_hash) ' + 'VALUES (\'{0:s}\', \'{1:s}\', \'{2:s}\')').format( + image_id, image_path, image_hash)) + + def is_image_in_case(self, image_id, case): + """Check if an image is attached to a case. + + Args: + image_id: Image identifier + case: Case name + + Returns: + True if the image is attached to the case, otherwise False. 
+ """ + image_case = self._query_single_row(( + 'SELECT 1 from image_case ' + 'WHERE image_id = \'{0:s}\' AND case_id = \'{1:s}\'').format( + image_id, case)) + if image_case: + return True + else: + return False + + def link_image_to_case(self, image_id, case): + """Attaches an image to a case. + + Args: + image_id: Image identifier + case: Case name + """ + self._execute(( + 'INSERT INTO image_case (case_id, image_id) ' + 'VALUES (\'{0:s}\', \'{1:s}\')').format(case, image_id)) + def switch_database( self, host='127.0.0.1', port=5432, db_name='dfdewey', autocommit=False): """Connects to a different database. @@ -128,6 +301,19 @@ class PostgresqlDataStore(): return self.cursor.fetchone() is not None + def unlink_image_from_case(self, image_id, case): + """Removes an image from a case. + + Args: + image_id: Image identifier + case: Case name + """ + self._execute( + """ + DELETE FROM image_case + WHERE case_id = '{0:s}' AND image_id = '{1:s}'""".format( + case, image_id)) + def value_exists(self, table_name, column_name, value): """Check if a value exists in a table. 
diff --git a/dfdewey/datastore/postgresql_test.py b/dfdewey/datastore/postgresql_test.py index 8254c7c..ac07d00 100644 --- a/dfdewey/datastore/postgresql_test.py +++ b/dfdewey/datastore/postgresql_test.py @@ -20,6 +20,7 @@ import mock from psycopg2 import OperationalError from dfdewey.datastore.postgresql import PostgresqlDataStore +from dfdewey.utils.image_processor_test import TEST_CASE, TEST_IMAGE, TEST_IMAGE_HASH, TEST_IMAGE_ID class PostgresqlTest(unittest.TestCase): @@ -47,15 +48,90 @@ class PostgresqlTest(unittest.TestCase): 'VALUES %s ON CONFLICT DO NOTHING') mock_execute_values.assert_called_once_with(db.cursor, expected_sql, rows) + def test_create_filesystem_database(self): + """Test create filesystem database method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'execute') as mock_execute: + db.create_filesystem_database() + + calls = [ + mock.call(( + 'CREATE TABLE blocks (block INTEGER, inum INTEGER, part TEXT, ' + 'PRIMARY KEY (block, inum, part))')), + mock.call(( + 'CREATE TABLE files (inum INTEGER, filename TEXT, part TEXT, ' + 'PRIMARY KEY (inum, filename, part))')) + ] + mock_execute.assert_has_calls(calls) + + def test_delete_filesystem_database(self): + """Test delete filesystem database method.""" + db = self._get_datastore() + db_name = ''.join(('fs', TEST_IMAGE_HASH)) + with mock.patch.object(db.cursor, 'execute') as mock_execute: + db.delete_filesystem_database(db_name) + mock_execute.assert_called_once_with( + 'DROP DATABASE {0:s}'.format(db_name)) + + def test_delete_image(self): + """Test delete image method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'execute') as mock_execute: + db.delete_image(TEST_IMAGE_ID) + mock_execute.assert_called_once_with( + 'DELETE FROM images WHERE image_id = \'{0:s}\''.format(TEST_IMAGE_ID)) + def test_execute(self): """Test execute method.""" db = self._get_datastore() command = ( 'CREATE TABLE images (image_path TEXT, image_hash TEXT PRIMARY KEY)') with 
mock.patch.object(db.cursor, 'execute') as mock_execute: - db.execute(command) + db._execute(command) mock_execute.assert_called_once_with(command) + def test_get_case_images(self): + """Test get case images method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'fetchall', + return_value=[(TEST_IMAGE_HASH, TEST_IMAGE)]): + images = db.get_case_images(TEST_CASE) + self.assertEqual(images, {TEST_IMAGE_HASH: TEST_IMAGE}) + + def test_get_filenames_from_inode(self): + """Test get filenames from inode method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'fetchall', + return_value=[('test.txt',), ('test.txt:ads',)]): + filenames = db.get_filenames_from_inode(42, '/p1') + self.assertEqual(len(filenames), 2) + self.assertEqual(filenames[0], 'test.txt') + self.assertEqual(filenames[1], 'test.txt:ads') + + def test_get_image_cases(self): + """Test get image cases method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'fetchall', return_value=[('test',), + ('test2',)]): + cases = db.get_image_cases(TEST_IMAGE_ID) + self.assertEqual(cases[0], 'test') + self.assertEqual(cases[1], 'test2') + + def test_get_image_hash(self): + """Test get image hash method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'fetchone', + return_value=(TEST_IMAGE_HASH,)): + image_hash = db.get_image_hash(TEST_IMAGE_ID) + self.assertEqual(image_hash, TEST_IMAGE_HASH) + + def test_get_inodes(self): + """Test get inodes method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'fetchall', return_value=[(10,), (19,)]): + inodes = db.get_inodes(1234, '/p1') + self.assertEqual(inodes, [10, 19]) + @mock.patch('psycopg2.connect') def test_init(self, mock_connect): """Test init method.""" @@ -63,12 +139,57 @@ class PostgresqlTest(unittest.TestCase): with self.assertRaises(RuntimeError): db = PostgresqlDataStore() + def test_initialise_database(self): + """Test initialise database method.""" + db = 
self._get_datastore() + calls = [ + mock.call( + 'CREATE TABLE images (image_id TEXT PRIMARY KEY, image_path TEXT, image_hash TEXT)' + ), + mock.call(( + 'CREATE TABLE image_case (' + 'case_id TEXT, image_id TEXT REFERENCES images(image_id), ' + 'PRIMARY KEY (case_id, image_id))')) + ] + with mock.patch.object(db.cursor, 'execute') as mock_execute: + db.initialise_database() + mock_execute.assert_has_calls(calls) + + def test_insert_image(self): + """Test insert image method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'execute') as mock_execute: + db.insert_image(TEST_IMAGE_ID, TEST_IMAGE, TEST_IMAGE_HASH) + mock_execute.assert_called_once_with(( + 'INSERT INTO images (image_id, image_path, image_hash) ' + 'VALUES (\'{0:s}\', \'{1:s}\', \'{2:s}\')').format( + TEST_IMAGE_ID, TEST_IMAGE, TEST_IMAGE_HASH)) + + def test_is_image_in_case(self): + """Test is image in case method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'fetchone', return_value=(1,)): + result = db.is_image_in_case(TEST_IMAGE_ID, TEST_CASE) + self.assertTrue(result) + with mock.patch.object(db.cursor, 'fetchone', return_value=None): + result = db.is_image_in_case(TEST_IMAGE_ID, TEST_CASE) + self.assertFalse(result) + + def test_link_image_to_case(self): + """Test link image to case method.""" + db = self._get_datastore() + with mock.patch.object(db.cursor, 'execute') as mock_execute: + db.link_image_to_case(TEST_IMAGE_ID, TEST_CASE) + mock_execute.assert_called_once_with(( + 'INSERT INTO image_case (case_id, image_id) ' + 'VALUES (\'{0:s}\', \'{1:s}\')').format(TEST_CASE, TEST_IMAGE_ID)) + def test_query(self): """Test query method.""" db = self._get_datastore() query = 'SELECT filename FROM files WHERE inum = 0' with mock.patch.object(db.cursor, 'fetchall', return_value=[('$MFT',)]): - results = db.query(query) + results = db._query(query) self.assertEqual(results, [('$MFT',)]) @@ -79,7 +200,7 @@ class PostgresqlTest(unittest.TestCase): 'SELECT 1 from 
image_case WHERE image_hash = ' '\'d41d8cd98f00b204e9800998ecf8427e\'') with mock.patch.object(db.cursor, 'fetchone', return_value=(1,)): - results = db.query_single_row(query) + results = db._query_single_row(query) self.assertEqual(results, (1,)) diff --git a/dfdewey/dfdcli.py b/dfdewey/dfdcli.py index 6901d7c..64ae2ea 100755 --- a/dfdewey/dfdcli.py +++ b/dfdewey/dfdcli.py @@ -100,7 +100,8 @@ def main(): log.error('Image must be supplied for processing.') sys.exit(1) image_processor_options = ImageProcessorOptions( - not args.no_base64, not args.no_gzip, not args.no_zip, args.reindex) + not args.no_base64, not args.no_gzip, not args.no_zip, args.reparse, + args.reindex, args.delete) image_processor = ImageProcessor( args.case, image_id, os.path.abspath(args.image), image_processor_options, args.config) @@ -134,9 +135,16 @@ def parse_args(): '--no_gzip', help='don\'t decompress gzip', action='store_true') parser.add_argument( '--no_zip', help='don\'t decompress zip', action='store_true') + parser.add_argument( + '--reparse', + help='reparse filesystem (will delete existing filesystem mapping)', + action='store_true') parser.add_argument( '--reindex', help='recreate index (will delete existing index)', action='store_true') + parser.add_argument( + '--delete', help='delete image (filesystem mapping and index)', + action='store_true') # Search args parser.add_argument( diff --git a/dfdewey/utils/image_processor.py b/dfdewey/utils/image_processor.py index 21f2149..39c96b0 100644 --- a/dfdewey/utils/image_processor.py +++ b/dfdewey/utils/image_processor.py @@ -290,7 +290,7 @@ class ImageProcessor(): self.case = case self.config = dfdewey_config.load_config(config_file=config_file) self.opensearch = None - self.image_hash = None + self.image_hash = image_id self.image_id = image_id self.image_path = image_path self.options = options @@ -313,7 +313,7 @@ class ImageProcessor(): image_exists = False if not tables_exist: - self._initialise_database() + 
self.postgresql.initialise_database() else: image_exists = self.postgresql.value_exists( 'images', 'image_id', self.image_id) @@ -322,39 +322,89 @@ class ImageProcessor(): # case. image_case_exists = False if image_exists: - image_case = self.postgresql.query_single_row(( - 'SELECT 1 from image_case ' - 'WHERE image_id = \'{0:s}\' AND case_id = \'{1:s}\'').format( - self.image_id, self.case)) - if image_case: - image_case_exists = True + image_case_exists = self.postgresql.is_image_in_case( + self.image_id, self.case) else: - self.postgresql.execute(( - 'INSERT INTO images (image_id, image_path, image_hash) ' - 'VALUES (\'{0:s}\', \'{1:s}\', \'{2:s}\')').format( - self.image_id, self.image_path, self.image_hash)) + self.postgresql.insert_image( + self.image_id, self.image_path, self.image_hash) if not image_case_exists: - self.postgresql.execute(( - 'INSERT INTO image_case (case_id, image_id) ' - 'VALUES (\'{0:s}\', \'{1:s}\')').format(self.case, self.image_id)) + self.postgresql.link_image_to_case(self.image_id, self.case) return image_exists - def _create_filesystem_database(self): - """Create a filesystem database for the image.""" - self.postgresql.execute(( - 'CREATE TABLE blocks (block INTEGER, inum INTEGER, part TEXT, ' - 'PRIMARY KEY (block, inum, part))')) - self.postgresql.execute(( - 'CREATE TABLE files (inum INTEGER, filename TEXT, part TEXT, ' - 'PRIMARY KEY (inum, filename, part))')) + def _connect_opensearch_datastore(self): + """Connect to the Opensearch datastore.""" + if self.config: + self.opensearch = OpenSearchDataStore( + host=self.config.OS_HOST, port=self.config.OS_PORT, + url=self.config.OS_URL) + else: + self.opensearch = OpenSearchDataStore() + + def _connect_postgresql_datastore(self): + """Connect to the PostgreSQL datastore.""" + if self.config: + self.postgresql = PostgresqlDataStore( + host=self.config.PG_HOST, port=self.config.PG_PORT, + db_name=self.config.PG_DB_NAME, autocommit=True) + else: + self.postgresql = 
PostgresqlDataStore(autocommit=True) + + def _delete_image_data(self): + """Delete image data. + + Delete filesystem database and index for the image. + """ + self._connect_postgresql_datastore() + # Check if image is linked to case + image_in_case = self.postgresql.is_image_in_case(self.image_id, self.case) + if not image_in_case: + log.error( + 'Image {0:s} does not exist in case {1:s}.'.format( + self.image_path, self.case)) + return + + # Unlink image from case + log.info( + 'Removing image {0:s} from case {1:s}'.format( + self.image_path, self.case)) + self.postgresql.unlink_image_from_case(self.image_id, self.case) + + # Check if image is linked to other cases + cases = self.postgresql.get_image_cases(self.image_id) + if cases: + log.warning( + 'Not deleting image {0:s} data. Still linked to cases: {1!s}'.format( + self.image_path, cases)) + return + + # Delete the image data + index_name = ''.join(('es', self.image_hash)) + self._connect_opensearch_datastore() + index_exists = self.opensearch.index_exists(index_name) + if index_exists: + log.info('Deleting index {0:s}.'.format(index_name)) + self.opensearch.delete_index(index_name) + else: + log.info('Index {0:s} does not exist.'.format(index_name)) + + db_name = ''.join(('fs', self.image_hash)) + log.info('Deleting database {0:s}.'.format(db_name)) + self.postgresql.delete_filesystem_database(db_name) + + # Remove the image from the database + self.postgresql.delete_image(self.image_id) + log.info( + 'Image {0:s} data has been removed from the datastores.'.format( + self.image_path)) def _extract_strings(self): """String extraction. Extract strings from the image using bulk_extractor. 
""" + self.output_path = tempfile.mkdtemp() cmd = [ 'bulk_extractor', '-o', self.output_path, '-x', 'all', '-e', 'wordlist' ] @@ -371,11 +421,9 @@ class ImageProcessor(): log.info('Running bulk_extractor: [%s]', ' '.join(cmd)) try: - output = subprocess.check_output(cmd) + subprocess.check_call(cmd) except subprocess.CalledProcessError as e: raise RuntimeError('String extraction failed.') from e - md5_offset = output.index(b'MD5') + 19 - self.image_hash = output[md5_offset:md5_offset + 32].decode('utf-8') def _get_volume_details(self, path_spec): """Logs volume details for the given path spec. @@ -435,12 +483,7 @@ class ImageProcessor(): def _index_strings(self): """Index the extracted strings.""" - if self.config: - self.opensearch = OpenSearchDataStore( - host=self.config.OS_HOST, port=self.config.OS_PORT, - url=self.config.OS_URL) - else: - self.opensearch = OpenSearchDataStore() + self._connect_opensearch_datastore() index_name = ''.join(('es', self.image_hash)) index_exists = self.opensearch.index_exists(index_name) if index_exists: @@ -488,40 +531,30 @@ class ImageProcessor(): records = self.opensearch.import_event(index_name) log.info('Indexed %d records...', records) - def _initialise_database(self): - """Initialse the image database.""" - self.postgresql.execute(( - 'CREATE TABLE images (image_id TEXT PRIMARY KEY, image_path TEXT, ' - 'image_hash TEXT)')) - - self.postgresql.execute(( - 'CREATE TABLE image_case (' - 'case_id TEXT, image_id TEXT REFERENCES images(image_id), ' - 'PRIMARY KEY (case_id, image_id))')) - def _parse_filesystems(self): """Filesystem parsing. Parse each filesystem to create a mapping from byte offsets to files. 
""" - if self.config: - self.postgresql = PostgresqlDataStore( - host=self.config.PG_HOST, port=self.config.PG_PORT, - db_name=self.config.PG_DB_NAME, autocommit=True) - else: - self.postgresql = PostgresqlDataStore(autocommit=True) - if self._already_parsed(): + self._connect_postgresql_datastore() + already_parsed = self._already_parsed() + db_name = ''.join(('fs', self.image_hash)) + if already_parsed: log.info('Image already parsed: [%s]', self.image_path) - else: - db_name = ''.join(('fs', self.image_hash)) - self.postgresql.execute('CREATE DATABASE {0:s}'.format(db_name)) + if self.options.reparse: + log.info('Reparsing.') + self.postgresql.delete_filesystem_database(db_name) + log.info('Database %s deleted.', db_name) + already_parsed = False + if not already_parsed: + self.postgresql.create_database(db_name) if self.config: self.postgresql.switch_database( host=self.config.PG_HOST, port=self.config.PG_PORT, db_name=db_name) else: self.postgresql.switch_database(db_name=db_name) - self._create_filesystem_database() + self.postgresql.create_filesystem_database() # Scan image for volumes options = volume_scanner.VolumeScannerOptions() @@ -588,18 +621,21 @@ class ImageProcessor(): def process_image(self): """Process the image.""" - self.output_path = tempfile.mkdtemp() - log.info('* Processing start: %s', datetime.now()) - self._extract_strings() - log.info('String extraction complete.') + if self.options.delete: + log.info('* Deleting image data: %s', datetime.now()) + self._delete_image_data() + else: + log.info('* Parsing image: %s', datetime.now()) + self._parse_filesystems() + log.info('Parsing complete.') - log.info('* Parsing image: %s', datetime.now()) - self._parse_filesystems() - log.info('Parsing complete.') + log.info('* Extracting strings: %s', datetime.now()) + self._extract_strings() + log.info('String extraction complete.') - log.info('* Indexing strings: %s', datetime.now()) - self._index_strings() - log.info('Indexing complete.') + log.info('* 
Indexing strings: %s', datetime.now()) + self._index_strings() + log.info('Indexing complete.') log.info('* Processing complete: %s', datetime.now()) @@ -613,10 +649,14 @@ class ImageProcessorOptions(): unzip (bool): decompress zip. """ - def __init__(self, base64=True, gunzip=True, unzip=True, reindex=False): + def __init__( + self, base64=True, gunzip=True, unzip=True, reparse=False, reindex=False, + delete=False): """Initialise image processor options.""" super().__init__() self.base64 = base64 self.gunzip = gunzip self.unzip = unzip + self.reparse = reparse self.reindex = reindex + self.delete = delete diff --git a/dfdewey/utils/image_processor_test.py b/dfdewey/utils/image_processor_test.py index a087547..9478cf8 100644 --- a/dfdewey/utils/image_processor_test.py +++ b/dfdewey/utils/image_processor_test.py @@ -79,10 +79,8 @@ class ImageProcessorTest(unittest.TestCase): image_processor.image_hash = TEST_IMAGE_HASH return image_processor - @mock.patch( - 'dfdewey.utils.image_processor.ImageProcessor._initialise_database') @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore') - def test_already_parsed(self, mock_postgresql, mock_initialise_database): + def test_already_parsed(self, mock_postgresql): """Test already parsed method.""" image_processor = self._get_image_processor() @@ -91,78 +89,91 @@ class ImageProcessorTest(unittest.TestCase): image_processor.postgresql = mock_postgresql result = image_processor._already_parsed() - mock_initialise_database.assert_called_once() - calls = [ - mock.call(( - 'INSERT INTO images (image_id, image_path, image_hash) ' - 'VALUES (\'{0:s}\', \'{1:s}\', \'{2:s}\')').format( - TEST_IMAGE_ID, TEST_IMAGE, TEST_IMAGE_HASH)), - mock.call(( - 'INSERT INTO image_case (case_id, image_id) ' - 'VALUES (\'{0:s}\', \'{1:s}\')').format(TEST_CASE, TEST_IMAGE_ID)) - ] - mock_postgresql.execute.assert_has_calls(calls) + mock_postgresql.initialise_database.assert_called_once() + mock_postgresql.insert_image.assert_called_once_with( 
+ TEST_IMAGE_ID, TEST_IMAGE, TEST_IMAGE_HASH) + mock_postgresql.link_image_to_case.assert_called_once_with( + TEST_IMAGE_ID, TEST_CASE) self.assertEqual(result, False) # Test database exists, image already in case mock_postgresql.table_exists.return_value = True mock_postgresql.value_exists.return_value = True - mock_postgresql.query_single_row.return_value = (1,) - mock_postgresql.execute.reset_mock() + mock_postgresql.is_image_in_case.return_value = True + mock_postgresql.link_image_to_case.reset_mock() image_processor.postgresql = mock_postgresql result = image_processor._already_parsed() - mock_postgresql.execute.assert_not_called() + mock_postgresql.link_image_to_case.assert_not_called() self.assertEqual(result, True) # Test database exists, image exists, but not in case - mock_postgresql.query_single_row.return_value = None + mock_postgresql.is_image_in_case.return_value = False image_processor.postgresql = mock_postgresql result = image_processor._already_parsed() - mock_postgresql.execute.assert_called_once_with(( - 'INSERT INTO image_case (case_id, image_id) ' - 'VALUES (\'{0:s}\', \'{1:s}\')').format(TEST_CASE, TEST_IMAGE_ID)) + mock_postgresql.link_image_to_case.assert_called_once_with( + TEST_IMAGE_ID, TEST_CASE) self.assertEqual(result, True) + @mock.patch( + 'dfdewey.utils.image_processor.ImageProcessor._connect_opensearch_datastore' + ) + @mock.patch( + 'dfdewey.utils.image_processor.ImageProcessor._connect_postgresql_datastore' + ) + @mock.patch('dfdewey.datastore.opensearch.OpenSearchDataStore') @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore') - def test_create_filesystem_database(self, mock_postgresql): - """Test create filesystem database method.""" + def test_delete_image_data( + self, mock_postgresql, mock_opensearch, mock_connect_postgres, + mock_connect_opensearch): + """Test delete image data method.""" image_processor = self._get_image_processor() image_processor.postgresql = mock_postgresql - 
image_processor._create_filesystem_database() + image_processor.opensearch = mock_opensearch + # Test if image is not in case + mock_postgresql.is_image_in_case.return_value = False + image_processor._delete_image_data() + mock_connect_postgres.assert_called_once() + mock_postgresql.unlink_image_from_case.assert_not_called() - calls = [ - mock.call(( - 'CREATE TABLE blocks (block INTEGER, inum INTEGER, part TEXT, ' - 'PRIMARY KEY (block, inum, part))')), - mock.call(( - 'CREATE TABLE files (inum INTEGER, filename TEXT, part TEXT, ' - 'PRIMARY KEY (inum, filename, part))')) - ] - mock_postgresql.execute.assert_has_calls(calls) + # Test if image is linked to multiple cases + mock_postgresql.is_image_in_case.return_value = True + mock_postgresql.get_image_cases.return_value = ['test'] + image_processor._delete_image_data() + mock_postgresql.get_image_cases.assert_called_once() + mock_connect_opensearch.assert_not_called() - @mock.patch('subprocess.check_output') - def test_extract_strings(self, mock_subprocess): + # Test if index exists + mock_postgresql.get_image_cases.return_value = None + mock_opensearch.index_exists.return_value = True + image_processor._delete_image_data() + mock_opensearch.delete_index.assert_called_once() + mock_postgresql.delete_filesystem_database.assert_called_once() + mock_postgresql.delete_image.assert_called_once() + + # Test if index doesn't exist + mock_opensearch.delete_index.reset_mock() + mock_opensearch.index_exists.return_value = False + image_processor._delete_image_data() + mock_opensearch.delete_index.assert_not_called() + + @mock.patch('tempfile.mkdtemp') + @mock.patch('subprocess.check_call') + def test_extract_strings(self, mock_subprocess, mock_mkdtemp): """Test extract strings method.""" image_processor = self._get_image_processor() - image_processor.output_path = '/tmp/tmpxaemz75r' - image_processor.image_hash = None + mock_mkdtemp.return_value = '/tmp/tmpxaemz75r' # Test with default options - mock_subprocess.return_value 
= 'MD5 of Disk Image: {0:s}'.format( - TEST_IMAGE_HASH).encode('utf-8') image_processor._extract_strings() mock_subprocess.assert_called_once_with([ 'bulk_extractor', '-o', '/tmp/tmpxaemz75r', '-x', 'all', '-e', 'wordlist', '-e', 'base64', '-e', 'gzip', '-e', 'zip', '-S', 'strings=YES', '-S', 'word_max=1000000', TEST_IMAGE ]) - self.assertEqual(image_processor.image_hash, TEST_IMAGE_HASH) # Test options mock_subprocess.reset_mock() - mock_subprocess.return_value = 'MD5 of Disk Image: {0:s}'.format( - TEST_IMAGE_HASH).encode('utf-8') image_processor.options.base64 = False image_processor.options.gunzip = False image_processor.options.unzip = False @@ -264,33 +275,17 @@ class ImageProcessorTest(unittest.TestCase): self.assertEqual(mock_index_record.call_count, 3) mock_import_event.assert_called_once() - @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore') - def test_initialise_database(self, mock_postgresql): - """Test initialise database method.""" - image_processor = self._get_image_processor() - image_processor.postgresql = mock_postgresql - calls = [ - mock.call( - 'CREATE TABLE images (image_id TEXT PRIMARY KEY, image_path TEXT, image_hash TEXT)' - ), - mock.call(( - 'CREATE TABLE image_case (' - 'case_id TEXT, image_id TEXT REFERENCES images(image_id), ' - 'PRIMARY KEY (case_id, image_id))')) - ] - image_processor._initialise_database() - mock_postgresql.execute.assert_has_calls(calls) - @mock.patch('psycopg2.connect') @mock.patch('dfdewey.utils.image_processor.ImageProcessor._already_parsed') @mock.patch( 'dfdewey.datastore.postgresql.PostgresqlDataStore.switch_database') - @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore.execute') + @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore._execute') @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore.bulk_insert') def test_parse_filesystems( self, mock_bulk_insert, mock_execute, mock_switch_database, mock_already_parsed, _): """Test parse filesystems method.""" + db_name 
= ''.join(('fs', TEST_IMAGE_HASH)) image_processor = self._get_image_processor() # Test image already parsed @@ -298,6 +293,13 @@ class ImageProcessorTest(unittest.TestCase): image_processor._parse_filesystems() mock_execute.assert_not_called() + # Test reparse flag + image_processor.options.reparse = True + image_processor._parse_filesystems() + mock_execute.assert_any_call('DROP DATABASE {0:s}'.format(db_name)) + mock_execute.reset_mock() + mock_switch_database.reset_mock() + # Test image not parsed current_path = os.path.abspath(os.path.dirname(__file__)) image_processor.image_path = os.path.join( @@ -305,8 +307,7 @@ class ImageProcessorTest(unittest.TestCase): mock_already_parsed.return_value = False image_processor._parse_filesystems() self.assertEqual(mock_execute.call_count, 3) - mock_switch_database.assert_called_once_with( - db_name=''.join(('fs', TEST_IMAGE_HASH))) + mock_switch_database.assert_called_once_with(db_name=db_name) self.assertIsInstance(image_processor.scanner, FileEntryScanner) self.assertEqual(len(image_processor.path_specs), 2) ntfs_path_spec = image_processor.path_specs[0] @@ -337,17 +338,20 @@ class ImageProcessorTest(unittest.TestCase): current_path, '..', '..', 'test_data', 'test.dmg') image_processor._parse_filesystems() + @mock.patch('dfdewey.utils.image_processor.ImageProcessor._delete_image_data') @mock.patch('dfdewey.utils.image_processor.ImageProcessor._parse_filesystems') @mock.patch('dfdewey.utils.image_processor.ImageProcessor._index_strings') @mock.patch('dfdewey.utils.image_processor.ImageProcessor._extract_strings') def test_process_image( - self, mock_extract_strings, mock_index_strings, mock_parse_filesystems): + self, mock_extract_strings, mock_index_strings, mock_parse_filesystems, + mock_delete_image_data): """Test process image method.""" image_processor = self._get_image_processor() image_processor.process_image() mock_extract_strings.assert_called_once() mock_index_strings.assert_called_once() 
mock_parse_filesystems.assert_called_once() + mock_delete_image_data.assert_not_called() if __name__ == '__main__': diff --git a/dfdewey/utils/index_searcher.py b/dfdewey/utils/index_searcher.py index 7ce58a5..03988a1 100644 --- a/dfdewey/utils/index_searcher.py +++ b/dfdewey/utils/index_searcher.py @@ -91,39 +91,11 @@ class IndexSearcher(): if image != 'all': self.image = os.path.abspath(self.image) - self._get_image_hash() + image_hash = self.postgresql.get_image_hash(self.image_id) + if image_hash: + self.images[image_hash] = self.image else: - self._get_case_images() - - def _get_case_images(self): - """Get all images for the case. - - Returns: - A dictionary of the images in the case. - """ - images = self.postgresql.query(( - 'SELECT image_hash, image_path FROM image_case NATURAL JOIN images ' - 'WHERE case_id = \'{0:s}\'').format(self.case)) - for image_hash, image_path in images: - self.images[image_hash] = image_path - - def _get_filenames_from_inode(self, inode, location): - """Gets filename(s) from an inode number. - - Args: - inode: Inode number of target file - location: Partition number - - Returns: - Filename(s) of given inode or None - """ - results = self.postgresql.query(( - 'SELECT filename FROM files ' - 'WHERE inum = {0:d} AND part = \'{1:s}\'').format(inode, location)) - filenames = [] - for result in results: - filenames.append(result[0]) - return filenames + self.images = self.postgresql.get_case_images(self.case) def _get_filenames_from_offset(self, image_path, image_hash, offset): """Gets filename(s) given a byte offset within an image. 
@@ -173,14 +145,13 @@ class IndexSearcher(): except TypeError as e: log.error('Error opening image: %s', e) - inodes = self._get_inodes( + inodes = self.postgresql.get_inodes( int((offset - partition_offset) / block_size), hit_location) if inodes: - for i in inodes: - inode = i[0] + for inode in inodes: # Account for resident files - if (i[0] == 0 and + if (inode == 0 and filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT): mft_record_size_offset = 0x40 + partition_offset mft_record_size = int.from_bytes( @@ -192,39 +163,13 @@ class IndexSearcher(): inode = self._get_ntfs_resident_inode((offset - partition_offset), filesystem, mft_record_size) - inode_filenames = self._get_filenames_from_inode(inode, hit_location) + inode_filenames = self.postgresql.get_filenames_from_inode( + inode, hit_location) filename = '\n'.join(inode_filenames) filenames.append('{0:s} ({1:d})'.format(filename, inode)) return filenames - def _get_image_hash(self): - """Get an image hash from the datastore. - - Returns: - MD5 hash for the image stored in PostgreSQL. - """ - image_hash = self.postgresql.query_single_row( - 'SELECT image_hash FROM images WHERE image_id = \'{0:s}\''.format( - self.image_id)) - if image_hash: - self.images[image_hash[0]] = self.image - - def _get_inodes(self, block, location): - """Gets inode numbers for a block offset. - - Args: - block (int): block offset within the image. - location (str): Partition location / identifier. - - Returns: - Inode number(s) of the given block or None. - """ - inodes = self.postgresql.query( - ('SELECT inum FROM blocks ' - 'WHERE block = {0:d} AND part = \'{1:s}\'').format(block, location)) - return inodes - def _get_ntfs_resident_inode(self, offset, filesystem, mft_record_size): """Gets the inode number associated with NTFS $MFT resident data. 
diff --git a/dfdewey/utils/index_searcher_test.py b/dfdewey/utils/index_searcher_test.py index ca732cc..0684262 100644 --- a/dfdewey/utils/index_searcher_test.py +++ b/dfdewey/utils/index_searcher_test.py @@ -39,14 +39,14 @@ class IndexSearcherTest(unittest.TestCase): Test index searcher. """ with mock.patch('psycopg2.connect'), mock.patch( - 'dfdewey.datastore.postgresql.PostgresqlDataStore.query_single_row' + 'dfdewey.datastore.postgresql.PostgresqlDataStore._query_single_row' ) as mock_query_single_row: mock_query_single_row.return_value = (TEST_IMAGE_HASH,) index_searcher = IndexSearcher(TEST_CASE, TEST_IMAGE_ID, TEST_IMAGE) index_searcher.config = None return index_searcher - @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore.query') + @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore._query') def test_get_case_images(self, mock_query): """Test get case images method.""" mock_query.return_value = [( @@ -61,19 +61,10 @@ class IndexSearcherTest(unittest.TestCase): self.assertEqual(index_searcher.images['hash1'], 'image1.dd') self.assertEqual(index_searcher.images['hash2'], 'image2.dd') - @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore.query') - def test_get_filenames_from_inode(self, mock_query): - """Test get filenames from inode method.""" - index_searcher = self._get_index_searcher() - mock_query.return_value = [('test.txt',), ('test.txt:ads',)] - filenames = index_searcher._get_filenames_from_inode(42, '/p1') - self.assertEqual(len(filenames), 2) - self.assertEqual(filenames[0], 'test.txt') - self.assertEqual(filenames[1], 'test.txt:ads') - - @mock.patch('dfdewey.utils.index_searcher.IndexSearcher._get_inodes') + @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore.get_inodes') @mock.patch( - 'dfdewey.utils.index_searcher.IndexSearcher._get_filenames_from_inode') + 'dfdewey.datastore.postgresql.PostgresqlDataStore.get_filenames_from_inode' + ) @mock.patch( 
'dfdewey.datastore.postgresql.PostgresqlDataStore.switch_database') def test_get_filenames_from_offset( @@ -94,7 +85,7 @@ class IndexSearcherTest(unittest.TestCase): # Test offset within a file mock_get_inodes.reset_mock() - mock_get_inodes.return_value = [(0,)] + mock_get_inodes.return_value = [0] mock_get_filenames_from_inode.return_value = ['adams.txt'] filenames = index_searcher._get_filenames_from_offset( image_path, TEST_IMAGE_HASH, 1133936) @@ -104,7 +95,7 @@ class IndexSearcherTest(unittest.TestCase): # Test volume image mock_get_inodes.reset_mock() - mock_get_inodes.return_value = [(2,)] + mock_get_inodes.return_value = [2] mock_get_filenames_from_inode.reset_mock() mock_get_filenames_from_inode.return_value = [] image_path = os.path.join( diff --git a/docker/Dockerfile b/docker/Dockerfile index 01f2803..fc30e06 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Use the official Docker Hub Ubuntu 18.04 base image -FROM ubuntu:18.04 +# Use the official Docker Hub Ubuntu 20.04 base image +FROM ubuntu:20.04 # Update the base image ENV DEBIAN_FRONTEND=noninteractive diff --git a/docs/usage.md b/docs/usage.md index ef2d1a4..5c139c0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,10 +1,7 @@ # Using dfDewey ```shell -usage: dfdewey [-h] [-c CONFIG] [--no_base64] [--no_gzip] [--no_zip] - [--reindex] [--highlight] [-s SEARCH] - [--search_list SEARCH_LIST] - case [image] +usage: dfdewey [-h] [-c CONFIG] [--no_base64] [--no_gzip] [--no_zip] [--reparse] [--reindex] [--delete] [--highlight] [-s SEARCH] [--search_list SEARCH_LIST] case [image] positional arguments: case case ID @@ -17,7 +14,9 @@ optional arguments: --no_base64 don't decode base64 --no_gzip don't decompress gzip --no_zip don't decompress zip + --reparse reparse filesystem (will delete existing filesystem mapping) --reindex recreate index (will delete existing index) + --delete delete image (filesystem mapping and index) --highlight highlight search term in results -s SEARCH, --search SEARCH search query @@ -77,6 +76,13 @@ dfDewey will have bulk_extractor decode base64 data, and decompress gzip / zip data by default. These can be disabled by adding the flags `--no_base64`, `--no_gzip`, and `--no_zip`. +If an image has already been processed, you can opt to reparse and reindex the +image (this will first delete the existing data) by adding the flags +`--reparse` and `--reindex`. + +You can also delete the data for a given image from the datastores by adding +the `--delete` flag. + ## Searching To search the index for a single image, you need to supply a `CASE`, `IMAGE`,