diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 6f6896d..1898cc3 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -5,8 +5,38 @@ on: types: [opened, synchronize, reopened] jobs: - build: - runs-on: ubuntu-latest + build-bionic: + runs-on: ubuntu-18.04 + strategy: + matrix: + include: + - python-version: '3.6' + - python-version: '3.7' + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + env: + DEBIAN_FRONTEND: noninteractive + run: | + sudo apt update -q + sudo apt install -y software-properties-common + + - name: Install dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + sudo add-apt-repository -y ppa:gift/stable + sudo apt update -q + sudo apt install -y python${{ matrix.python-version }} python3-dfvfs python3-pip python3-setuptools + python3 -m pip install .[dev] + + - name: Run unit tests + run: python3 run_tests.py + + build-focal: + runs-on: ubuntu-20.04 strategy: matrix: include: diff --git a/README.md b/README.md index c8589be..b66f216 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ pip install -r dfvfs_requirements.txt ``` ### Datastores -Elasticsearch and PostgreSQL are also required to store extracted data. +OpenSearch and PostgreSQL are also required to store extracted data. These can be installed separately or started in Docker using `docker-compose`. ```shell diff --git a/dfdewey/__init__.py b/dfdewey/__init__.py index 3f03266..298299c 100644 --- a/dfdewey/__init__.py +++ b/dfdewey/__init__.py @@ -17,4 +17,4 @@ dfDewey is a digital forensics string extraction, indexing, and searching tool. """ -__version__ = '20211201' +__version__ = '20211220' diff --git a/dfdewey/config/__init__.py b/dfdewey/config/__init__.py index 180f4d9..0a335ad 100644 --- a/dfdewey/config/__init__.py +++ b/dfdewey/config/__init__.py @@ -19,7 +19,7 @@ import logging import os CONFIG_ENV = [ - 'PG_HOST', 'PG_PORT', 'PG_DB_NAME', 'ES_HOST', 'ES_PORT', 'ES_URL' + 'PG_HOST', 'PG_PORT', 'PG_DB_NAME', 'OS_HOST', 'OS_PORT', 'OS_URL' ] CONFIG_FILE = '.dfdeweyrc' # Look in homedir first, then current dir @@ -51,7 +51,7 @@ def load_config(config_file=None): for config_var in CONFIG_ENV: config_env = os.environ.get('_'.join(('DFDEWEY', config_var))) if not config_env: - if config_var == 'ES_URL': + if config_var == 'OS_URL': config_str += '{0:s} = {1:s}\n'.format(config_var, 'None') break else: diff --git a/dfdewey/config/config_template.py b/dfdewey/config/config_template.py index 0ededc5..55db4fd 100644 --- a/dfdewey/config/config_template.py +++ b/dfdewey/config/config_template.py @@ -19,9 +19,9 @@ PG_HOST = '127.0.0.1' PG_PORT = 5432 PG_DB_NAME = 'dfdewey' -# Elasticsearch Config -ES_HOST = '127.0.0.1' -ES_PORT = 9200 -# ES_URL can be used to specify a RFC-1738 formatted URL -# Example: ES_URL = 'https://user:secret@127.0.0.1:9200/' -ES_URL = None +# OpenSearch Config +OS_HOST = '127.0.0.1' +OS_PORT = 9200 +# OS_URL can be used to specify a RFC-1738 formatted URL +# Example: OS_URL = 'https://user:secret@127.0.0.1:9200/' +OS_URL = None diff --git a/dfdewey/datastore/elastic.py b/dfdewey/datastore/opensearch.py similarity index 82% rename from dfdewey/datastore/elastic.py rename to dfdewey/datastore/opensearch.py index ad02611..3686ddd 100644 --- a/dfdewey/datastore/elastic.py +++ b/dfdewey/datastore/opensearch.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Google LLC +# Copyright 2021 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,15 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Elasticsearch datastore.""" +"""Opensearch datastore.""" import collections -from elasticsearch import Elasticsearch -from elasticsearch import exceptions +from opensearchpy import OpenSearch +from opensearchpy import exceptions -class ElasticsearchDataStore(): +class OpenSearchDataStore(): """Implements the datastore.""" # Number of events to queue up when bulk inserting events. @@ -28,24 +28,24 @@ class ElasticsearchDataStore(): DEFAULT_SIZE = 1000 # Max events to return def __init__(self, host='127.0.0.1', port=9200, url=None): - """Create an Elasticsearch client.""" + """Create an OpenSearch client.""" super().__init__() if url: - self.client = Elasticsearch([url], timeout=30) + self.client = OpenSearch([url], timeout=30) else: - self.client = Elasticsearch([{'host': host, 'port': port}], timeout=30) + self.client = OpenSearch([{'host': host, 'port': port}], timeout=30) self.import_counter = collections.Counter() self.import_events = [] @staticmethod def build_query(query_string): - """Build Elasticsearch DSL query. + """Build OpenSearch DSL query. Args: query_string: Query string Returns: - Elasticsearch DSL query as a dictionary + OpenSearch DSL query as a dictionary """ query_dsl = { @@ -80,7 +80,7 @@ class ElasticsearchDataStore(): return index_name def delete_index(self, index_name): - """Delete Elasticsearch index. + """Delete OpenSearch index. Args: index_name: Name of the index to delete. @@ -93,10 +93,10 @@ class ElasticsearchDataStore(): def import_event( self, index_name, event=None, flush_interval=DEFAULT_FLUSH_INTERVAL): - """Add event to Elasticsearch. + """Add event to OpenSearch. Args: - index_name: Name of the index in Elasticsearch + index_name: Name of the index in OpenSearch event: Event dictionary flush_interval: Number of events to queue up before indexing @@ -104,7 +104,7 @@ class ElasticsearchDataStore(): The number of events processed. """ if event: - # Header needed by Elasticsearch when bulk inserting. + # Header needed by OpenSearch when bulk inserting. header = {'index': {'_index': index_name}} self.import_events.append(header) @@ -133,11 +133,11 @@ class ElasticsearchDataStore(): return self.client.indices.exists(index_name) def search(self, index_id, query_string, size=DEFAULT_SIZE): - """Search ElasticSearch. + """Search OpenSearch. This will take a query string from the UI together with a filter definition. - Based on this it will execute the search request on ElasticSearch and get - the result back. + Based on this it will execute the search request on OpenSearch and get the + result back. Args: index_id: Index to be searched @@ -150,7 +150,7 @@ class ElasticsearchDataStore(): query_dsl = self.build_query(query_string) - # Default search type for elasticsearch is query_then_fetch. + # Default search type for OpenSearch is query_then_fetch. search_type = 'query_then_fetch' # pylint: disable=unexpected-keyword-arg diff --git a/dfdewey/datastore/elastic_test.py b/dfdewey/datastore/opensearch_test.py similarity index 87% rename from dfdewey/datastore/elastic_test.py rename to dfdewey/datastore/opensearch_test.py index 6623542..bf5d43a 100644 --- a/dfdewey/datastore/elastic_test.py +++ b/dfdewey/datastore/opensearch_test.py @@ -12,28 +12,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for elasticsearch datastore.""" +"""Tests for opensearch datastore.""" import unittest import mock -from elasticsearch import exceptions +from opensearchpy import exceptions -from dfdewey.datastore.elastic import ElasticsearchDataStore +from dfdewey.datastore.opensearch import OpenSearchDataStore TEST_INDEX_NAME = ''.join(('es', 'd41d8cd98f00b204e9800998ecf8427e')) -class ElasticTest(unittest.TestCase): - """Tests for Elasticsearch datastore.""" +class OpenSearchTest(unittest.TestCase): + """Tests for OpenSearch datastore.""" def _get_datastore(self): - """Get a mock elasticsearch datastore. + """Get a mock opensearch datastore. Returns: - Mock elasticsearch datastore. + Mock opensearch datastore. """ - es = ElasticsearchDataStore() + es = OpenSearchDataStore() return es def test_build_query(self): @@ -56,8 +56,8 @@ class ElasticTest(unittest.TestCase): self.assertEqual(query, query_dsl) - @mock.patch('elasticsearch.client.IndicesClient.create') - @mock.patch('elasticsearch.client.IndicesClient.exists') + @mock.patch('opensearchpy.client.IndicesClient.create') + @mock.patch('opensearchpy.client.IndicesClient.exists') def test_create_index(self, mock_exists, mock_create): """Test create index method.""" es = self._get_datastore() @@ -71,8 +71,8 @@ class ElasticTest(unittest.TestCase): with self.assertRaises(RuntimeError): result = es.create_index(TEST_INDEX_NAME) - @mock.patch('elasticsearch.client.IndicesClient.delete') - @mock.patch('elasticsearch.client.IndicesClient.exists') + @mock.patch('opensearchpy.client.IndicesClient.delete') + @mock.patch('opensearchpy.client.IndicesClient.exists') def test_delete_index(self, mock_exists, mock_delete): """Test delete index method.""" es = self._get_datastore() @@ -131,7 +131,7 @@ class ElasticTest(unittest.TestCase): result = es.import_event(TEST_INDEX_NAME, test_event, flush_interval=1) self.assertEqual(result, 1) - @mock.patch('elasticsearch.client.IndicesClient.exists') + @mock.patch('opensearchpy.client.IndicesClient.exists') def test_index_exists(self, mock_exists): """Test index exists method.""" es = self._get_datastore() @@ -139,8 +139,8 @@ class ElasticTest(unittest.TestCase): es.index_exists(TEST_INDEX_NAME) mock_exists.assert_called_once_with(TEST_INDEX_NAME) - @mock.patch('elasticsearch.Elasticsearch.search') - @mock.patch('elasticsearch.client.IndicesClient.exists') + @mock.patch('opensearchpy.OpenSearch.search') + @mock.patch('opensearchpy.client.IndicesClient.exists') def test_search(self, mock_exists, mock_search): """Test search method.""" es = self._get_datastore() diff --git a/dfdewey/dfdcli.py b/dfdewey/dfdcli.py index c9bb381..6901d7c 100755 --- a/dfdewey/dfdcli.py +++ b/dfdewey/dfdcli.py @@ -31,7 +31,7 @@ log = logging.getLogger('dfdewey') class _StringRecord(): - """Elasticsearch string record. + """OpenSearch string record. Attributes: image: Hash to identify the source image of the string @@ -68,7 +68,10 @@ def get_image_id(image_path): with open(image_path, 'rb') as image_file: hash = hashlib.md5() hashed = 0 - while chunk := image_file.read(8192): + while True: + chunk = image_file.read(8192) + if not chunk: + break hash.update(chunk) hashed += 1 if hashed == 262144: diff --git a/dfdewey/utils/image_processor.py b/dfdewey/utils/image_processor.py index d43a461..83e95d1 100644 --- a/dfdewey/utils/image_processor.py +++ b/dfdewey/utils/image_processor.py @@ -30,7 +30,7 @@ from dfvfs.volume import tsk_volume_system import pytsk3 import dfdewey.config as dfdewey_config -from dfdewey.datastore.elastic import ElasticsearchDataStore +from dfdewey.datastore.opensearch import OpenSearchDataStore from dfdewey.datastore.postgresql import PostgresqlDataStore BATCH_SIZE = 1500 @@ -40,7 +40,7 @@ log = logging.getLogger('dfdewey.image_processor') class _StringRecord(): - """Elasticsearch string record. + """OpenSearch string record. Attributes: image: Hash to identify the source image of the string @@ -270,7 +270,7 @@ class ImageProcessor(): Attributes: case (str): case ID. - elasticsearch (ElasticsearchDataStore): elasticsearch datastore. + opensearch (OpenSearchDataStore): opensearch datastore. image_hash (str): MD5 hash of the image. image_id (str): image identifier. image_path (str): path to source image. @@ -286,7 +286,7 @@ class ImageProcessor(): super().__init__() self.case = case self.config = dfdewey_config.load_config(config_file=config_file) - self.elasticsearch = None + self.opensearch = None self.image_hash = None self.image_id = image_id self.image_path = image_path @@ -416,7 +416,7 @@ class ImageProcessor(): """Index a single record. Args: - index_name: ID of the elasticsearch index. + index_name: ID of the opensearch index. string_record: String record to be indexed. Returns: @@ -428,27 +428,27 @@ class ImageProcessor(): 'file_offset': string_record.file_offset, 'data': string_record.data } - return self.elasticsearch.import_event(index_name, event=json_record) + return self.opensearch.import_event(index_name, event=json_record) def _index_strings(self): """Index the extracted strings.""" if self.config: - self.elasticsearch = ElasticsearchDataStore( + self.OpenSearch = OpenSearchDataStore( host=self.config.ES_HOST, port=self.config.ES_PORT, url=self.config.ES_URL) else: - self.elasticsearch = ElasticsearchDataStore() + self.opensearch = OpenSearchDataStore() index_name = ''.join(('es', self.image_hash)) - index_exists = self.elasticsearch.index_exists(index_name) + index_exists = self.opensearch.index_exists(index_name) if index_exists: log.info('Image already indexed: [%s]', self.image_path) if self.options.reindex: log.info('Reindexing.') - self.elasticsearch.delete_index(index_name) + self.opensearch.delete_index(index_name) log.info('Index %s deleted.', index_name) index_exists = False if not index_exists: - index_name = self.elasticsearch.create_index(index_name=index_name) + index_name = self.opensearch.create_index(index_name=index_name) log.info('Index %s created.', index_name) string_list = os.path.join(self.output_path, 'wordlist.txt') @@ -482,7 +482,7 @@ class ImageProcessor(): if records % STRING_INDEXING_LOG_INTERVAL == 0: log.info('Indexed %d records...', records) # Flush the import buffer - records = self.elasticsearch.import_event(index_name) + records = self.opensearch.import_event(index_name) log.info('Indexed %d records...', records) def _initialise_database(self): diff --git a/dfdewey/utils/image_processor_test.py b/dfdewey/utils/image_processor_test.py index 4bc884f..a087547 100644 --- a/dfdewey/utils/image_processor_test.py +++ b/dfdewey/utils/image_processor_test.py @@ -201,8 +201,8 @@ class ImageProcessorTest(unittest.TestCase): self.assertEqual(location, '/p1') self.assertEqual(start_offset, 1048576) - @mock.patch('dfdewey.datastore.elastic.ElasticsearchDataStore') - def test_index_record(self, mock_elasticsearch): + @mock.patch('dfdewey.datastore.opensearch.OpenSearchDataStore') + def test_index_record(self, mock_opensearch): """Test index record method.""" image_processor = self._get_image_processor() @@ -212,7 +212,7 @@ class ImageProcessorTest(unittest.TestCase): string_record.offset = 1234567 string_record.data = 'test string' - image_processor.elasticsearch = mock_elasticsearch + image_processor.opensearch = mock_opensearch image_processor._index_record(index_name, string_record) json_record = { @@ -221,14 +221,14 @@ class ImageProcessorTest(unittest.TestCase): 'file_offset': string_record.file_offset, 'data': string_record.data } - mock_elasticsearch.import_event.assert_called_once_with( + mock_opensearch.import_event.assert_called_once_with( index_name, event=json_record) - @mock.patch('elasticsearch.client.IndicesClient') + @mock.patch('opensearchpy.client.IndicesClient') @mock.patch('dfdewey.utils.image_processor.ImageProcessor._index_record') - @mock.patch('dfdewey.datastore.elastic.ElasticsearchDataStore.index_exists') - @mock.patch('dfdewey.datastore.elastic.ElasticsearchDataStore.import_event') - @mock.patch('dfdewey.datastore.elastic.ElasticsearchDataStore.create_index') + @mock.patch('dfdewey.datastore.opensearch.OpenSearchDataStore.index_exists') + @mock.patch('dfdewey.datastore.opensearch.OpenSearchDataStore.import_event') + @mock.patch('dfdewey.datastore.opensearch.OpenSearchDataStore.create_index') def test_index_strings( self, mock_create_index, mock_import_event, mock_index_exists, mock_index_record, _): diff --git a/dfdewey/utils/index_searcher.py b/dfdewey/utils/index_searcher.py index 6d9bade..4d3b640 100644 --- a/dfdewey/utils/index_searcher.py +++ b/dfdewey/utils/index_searcher.py @@ -24,7 +24,7 @@ import pytsk3 from tabulate import tabulate import dfdewey.config as dfdewey_config -from dfdewey.datastore.elastic import ElasticsearchDataStore +from dfdewey.datastore.opensearch import OpenSearchDataStore from dfdewey.datastore.postgresql import PostgresqlDataStore from dfdewey.utils.image_processor import FileEntryScanner @@ -71,7 +71,7 @@ class IndexSearcher(): super().__init__() self.case = case self.config = dfdewey_config.load_config(config_file) - self.elasticsearch = None + self.opensearch = None self.image = image self.image_id = image_id self.images = {} @@ -82,12 +82,12 @@ class IndexSearcher(): self.postgresql = PostgresqlDataStore( host=self.config.PG_HOST, port=self.config.PG_PORT, db_name=self.config.PG_DB_NAME) - self.elasticsearch = ElasticsearchDataStore( + self.opensearch = OpenSearchDataStore( host=self.config.ES_HOST, port=self.config.ES_PORT, url=self.config.ES_URL) else: self.postgresql = PostgresqlDataStore() - self.elasticsearch = ElasticsearchDataStore() + self.opensearch = OpenSearchDataStore() if image != 'all': self.image = os.path.abspath(self.image) @@ -331,7 +331,7 @@ class IndexSearcher(): table_data = [] for term in search_terms: term = ''.join(('"', term.strip(), '"')) - results = self.elasticsearch.search(index, term) + results = self.opensearch.search(index, term) hit_count = results['hits']['total']['value'] if hit_count > 0: table_data.append({'Search term': term, 'Hits': hit_count}) @@ -353,7 +353,7 @@ class IndexSearcher(): for image_hash, image_path in self.images.items(): log.info('Searching %s (%s) for "%s"', image_path, image_hash, query) index = ''.join(('es', image_hash)) - results = self.elasticsearch.search(index, query) + results = self.opensearch.search(index, query) result_count = results['hits']['total']['value'] time_taken = results['took'] diff --git a/dfdewey/utils/index_searcher_test.py b/dfdewey/utils/index_searcher_test.py index 0a18c6f..ca732cc 100644 --- a/dfdewey/utils/index_searcher_test.py +++ b/dfdewey/utils/index_searcher_test.py @@ -142,7 +142,7 @@ class IndexSearcherTest(unittest.TestCase): ]) @mock.patch('logging.Logger.info') - @mock.patch('dfdewey.datastore.elastic.ElasticsearchDataStore.search') + @mock.patch('dfdewey.datastore.opensearch.OpenSearchDataStore.search') def test_list_search(self, mock_search, mock_output): """Test list search.""" index_searcher = self._get_index_searcher() @@ -167,7 +167,7 @@ class IndexSearcherTest(unittest.TestCase): @mock.patch('logging.Logger.info') @mock.patch('dfdewey.datastore.postgresql.PostgresqlDataStore') - @mock.patch('dfdewey.datastore.elastic.ElasticsearchDataStore.search') + @mock.patch('dfdewey.datastore.opensearch.OpenSearchDataStore.search') def test_search(self, mock_search, mock_postgresql, mock_output): """Test search method.""" index_searcher = self._get_index_searcher() diff --git a/dfvfs_requirements.txt b/dfvfs_requirements.txt index e51a96a..49dd28d 100644 --- a/dfvfs_requirements.txt +++ b/dfvfs_requirements.txt @@ -1,9 +1,9 @@ -dfvfs >= 20211107 +dfvfs >= 20211017 pip >= 7.0.0 PyYAML >= 3.10 cffi >= 1.9.1 cryptography >= 2.0.2 -dfdatetime >= 20211113 +dfdatetime >= 20210509 dtfabric >= 20170524 libbde-python >= 20140531 libewf-python >= 20131210 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index b3b8ee7..938fef0 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -11,28 +11,38 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -version: '2' +version: '3' services: - elasticsearch: + opensearch: + image: opensearchproject/opensearch:latest environment: + # Set to single node deployment - discovery.type=single-node - # Java memory for Elasticsearch is set high for better performance when + # Disabling SSL for localhost only deployment + - plugins.security.disabled=true + # Java memory for Opensearch is set high for better performance when # indexing large volumes of data. # If running on a system with less available memory, consider using # something smaller, such as: - # - ES_JAVA_OPTS=-Xms512m -Xmx512m - - ES_JAVA_OPTS=-Xms32g -Xmx32g - image: elasticsearch:7.9.3 + # - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" + # Recommend setting to 50% of system RAM + - "OPENSEARCH_JAVA_OPTS=-Xms32g -Xmx32g" + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 + hard: 65536 ports: - "127.0.0.1:9200:9200" - - "127.0.0.1:9300:9300" restart: always postgres: - image: postgres - ports: - - "127.0.0.1:5432:5432" + image: postgres:latest environment: - POSTGRES_USER=dfdewey - POSTGRES_PASSWORD=password + ports: + - "127.0.0.1:5432:5432" restart: always diff --git a/docs/usage.md b/docs/usage.md index 305c926..ef2d1a4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -27,7 +27,7 @@ optional arguments: ## Docker -If using Elasticsearch and PostgreSQL in Docker, they can be started using +If using OpenSearch and PostgreSQL in Docker, they can be started using [docker-compose](https://docs.docker.com/compose/install/) from the `docker` folder. @@ -35,7 +35,7 @@ folder. docker-compose up -d ``` -Note: Java memory for Elasticsearch is set high to improve performance when +Note: Java memory for OpenSearch is set high to improve performance when indexing large volumes of data. If running on a system with limited resources, you can change the setting in `docker/docker-compose.yml`. @@ -57,7 +57,7 @@ docker build -t -f ./docker/Dockerfile . ``` When running dfDewey within a Docker container, we need to give the container -access to the host network so it will be able to access Elasticsearch and +access to the host network so it will be able to access OpenSearch and PostgreSQL in their respective containers. We also need to map a folder in the container to allow access to the image we want to process. For example: diff --git a/requirements.txt b/requirements.txt index 9db584d..d46c314 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -elasticsearch +opensearch-py psycopg2-binary six tabulate diff --git a/setup.py b/setup.py index f218dbe..6c452b2 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ setup( name='dfDewey', version=dfdewey.__version__, description=DFDEWEY_DESCRIPTION, + long_description=DFDEWEY_DESCRIPTION, license='Apache License, Version 2.0', url='https://github.com/google/dfdewey', maintainer='dfDewey development team',