Merge pull request #5 from google/refactor

Refactor CLI, processing, and searching
Jason 2020-11-20 15:07:20 +11:00 committed by GitHub
commit 7ac54eccc2
11 changed files with 1029 additions and 654 deletions


@@ -16,16 +16,11 @@
import codecs
import collections
import logging
from elasticsearch import Elasticsearch
from elasticsearch import exceptions
import six
# Setup logging
es_logger = logging.getLogger('dfdewey.elasticsearch')
es_logger.setLevel(logging.WARNING)
class ElasticsearchDataStore():
"""Implements the datastore."""
@@ -138,6 +133,17 @@ class ElasticsearchDataStore():
return self.import_counter['events']
def index_exists(self, index_name):
"""Check if an index already exists.
Args:
index_name: Name of the index
Returns:
True if the index exists, False if not.
"""
return self.client.indices.exists(index_name)
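The new `index_exists` helper wraps the elasticsearch-py indices API and is what later lets the image processor skip re-indexing. A minimal sketch of that pattern, assuming a reachable Elasticsearch instance and a placeholder image hash:

```python
from dfdewey.datastore.elastic import ElasticsearchDataStore

es = ElasticsearchDataStore()
image_hash = 'd41d8cd98f00b204e9800998ecf8427e'  # placeholder MD5
index_name = ''.join(('es', image_hash))
if es.index_exists(index_name):
    print('Image already indexed')
else:
    index_name = es.create_index(index_name=index_name)
```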
def search(self, index_id, query_string, size=DEFAULT_SIZE):
"""Search ElasticSearch.


@@ -14,15 +14,9 @@
# limitations under the License.
"""PostgreSQL datastore."""
import logging
import psycopg2
from psycopg2 import extras
# Setup logging
postgresql_logger = logging.getLogger('dfdewey.postgresql')
postgresql_logger.setLevel(logging.WARNING)
class PostgresqlDataStore():
"""Implements the datastore."""
@@ -31,9 +25,12 @@ class PostgresqlDataStore():
self, host='127.0.0.1', port=5432, db_name='dfdewey', autocommit=False):
"""Create a PostgreSQL client."""
super().__init__()
self.db = psycopg2.connect(
database=db_name, user='dfdewey', password='password', host=host,
port=port)
try:
self.db = psycopg2.connect(
database=db_name, user='dfdewey', password='password', host=host,
port=port)
except psycopg2.OperationalError as e:
raise RuntimeError('Unable to connect to PostgreSQL.') from e
if autocommit:
self.db.set_isolation_level(
psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
@@ -41,9 +38,11 @@ class PostgresqlDataStore():
def __del__(self):
"""Finalise a PostgreSQL client."""
if self.db:
try:
self.db.commit()
self.db.close()
except AttributeError:
pass
def bulk_insert(self, table_spec, rows):
"""Execute a bulk insert into a table.
@@ -53,7 +52,9 @@ class PostgresqlDataStore():
rows: Array of value tuples to be inserted
"""
extras.execute_values(
self.cursor, 'INSERT INTO {0:s} VALUES %s'.format(table_spec), rows)
self.cursor,
'INSERT INTO {0:s} VALUES %s ON CONFLICT DO NOTHING'.format(table_spec),
rows)
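Adding `ON CONFLICT DO NOTHING` makes repeated bulk inserts idempotent: rows that would violate the primary keys introduced on the `blocks` and `files` tables later in this change are skipped instead of aborting the whole batch. A minimal sketch of the resulting statement, using illustrative connection parameters and an illustrative two-column `blocks` table with a primary key on `(block, inum)`:

```python
import psycopg2
from psycopg2 import extras

# Connection parameters mirror the defaults used in the diff above.
db = psycopg2.connect(
    database='dfdewey', user='dfdewey', password='password', host='127.0.0.1',
    port=5432)
cursor = db.cursor()
rows = [(1, 1), (2, 2), (1, 1)]  # the duplicate (1, 1) is silently skipped
extras.execute_values(
    cursor, 'INSERT INTO blocks (block, inum) VALUES %s ON CONFLICT DO NOTHING',
    rows)
db.commit()
```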
def execute(self, command):
"""Execute a command in the PostgreSQL database.


@@ -40,7 +40,7 @@ class PostgresqlTest(unittest.TestCase):
rows = [(1, 1), (2, 2), (3, 3)]
db.bulk_insert('blocks (block, inum)', rows)
expected_sql = 'INSERT INTO blocks (block, inum) VALUES %s'
expected_sql = 'INSERT INTO blocks (block, inum) VALUES %s ON CONFLICT DO NOTHING'
mock_execute_values.assert_called_once_with(db.cursor, expected_sql, rows)
def test_execute(self):


@@ -16,19 +16,20 @@
"""DFDewey Command-Line Interface."""
import argparse
import datetime
import logging
import os
import subprocess
import tempfile
import sys
from dfdewey.datastore.elastic import ElasticsearchDataStore
from dfdewey.datastore.postgresql import PostgresqlDataStore
from dfdewey.utils import image
from dfdewey.utils.image_processor import ImageProcessorOptions, ImageProcessor
from dfdewey.utils.index_searcher import IndexSearcher
STRING_INDEXING_LOG_INTERVAL = 10000000
# Setup logging
log = logging.getLogger('dfdewey')
class _StringRecord(object):
class _StringRecord():
"""Elasticsearch string record.
Attributes:
@@ -46,6 +47,30 @@ class _StringRecord(object):
self.data = ''
def main():
"""Main DFDewey function."""
args = parse_args()
setup_logging()
if not args.search and not args.search_list:
# Processing an image since no search terms specified
if args.image == 'all':
log.error('Image must be supplied for processing.')
sys.exit(1)
image_processor_options = ImageProcessorOptions(
not args.no_base64, not args.no_gzip, not args.no_zip)
image_processor = ImageProcessor(
args.case, os.path.abspath(args.image), image_processor_options)
image_processor.process_image()
else:
index_searcher = IndexSearcher(args.case, args.image)
if args.search:
index_searcher.search(args.search)
elif args.search_list:
index_searcher.list_search(args.search_list)
def parse_args():
"""Argument parsing function.
@@ -54,8 +79,9 @@
"""
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--case', required=True, help='case ID')
parser.add_argument('-i', '--image', help='image file')
parser.add_argument('case', help='case ID')
parser.add_argument(
'image', nargs='?', default='all', help='image file (default: \'all\')')
# Indexing args
parser.add_argument(
@@ -73,213 +99,17 @@
return args
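With `case` and `image` now positional (and `image` defaulting to `'all'`), the old `-c`/`-i` flags disappear. A small sketch of how the new signature parses, using an illustrative subset of the parser (the `-s` search flag appears in the usage text later in this change):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('case', help='case ID')
parser.add_argument(
    'image', nargs='?', default='all', help='image file (default: \'all\')')
parser.add_argument('-s', '--search', help='search query')

print(parser.parse_args(['testcase', '-s', 'foo']).image)  # 'all': search every image
print(parser.parse_args(['testcase', '/path/to/image.dd']).image)  # process this image
```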
def process_image(image_file, case, base64, gunzip, unzip):
"""Image processing function.
def setup_logging():
"""Configure the logger."""
log.propagate = False
log.setLevel(logging.INFO)
Run string extraction, indexing, and filesystem parsing for image file.
Args:
image_file: The image file to be processed
case: Case ID
base64: Flag to decode Base64
gunzip: Flag to decompress gzip
unzip: Flag to decompress zip
"""
image_path = os.path.abspath(image_file)
output_path = tempfile.mkdtemp()
cmd = ['bulk_extractor', '-o', output_path, '-x', 'all', '-e', 'wordlist']
if base64:
cmd.extend(['-e', 'base64'])
if gunzip:
cmd.extend(['-e', 'gzip'])
if unzip:
cmd.extend(['-e', 'zip'])
cmd.extend(['-S', 'strings=YES', '-S', 'word_max=1000000'])
cmd.extend([image_path])
print('Processing start: {0!s}'.format(datetime.datetime.now()))
print('\n*** Running bulk extractor:\n{0:s}'.format(' '.join(cmd)))
output = subprocess.check_output(cmd)
md5_offset = output.index(b'MD5') + 19
image_hash = output[md5_offset:md5_offset + 32].decode('utf-8')
print('String extraction completed: {0!s}'.format(datetime.datetime.now()))
print('\n*** Parsing image')
image_already_processed = image.initialise_block_db(
image_path, image_hash, case)
print('Parsing completed: {0!s}'.format(datetime.datetime.now()))
if not image_already_processed:
print('\n*** Indexing image')
index_strings(output_path, image_hash)
print('Indexing completed: {0!s}'.format(datetime.datetime.now()))
else:
print('\n*** Image already indexed')
print('Processing complete!')
def index_strings(output_path, image_hash):
"""ElasticSearch indexing function.
Args:
output_path: The output directory from bulk_extractor
image_hash: MD5 of the parsed image
"""
print('\n*** Indexing data...')
es = ElasticsearchDataStore()
index_name = ''.join(('es', image_hash))
index_name = es.create_index(index_name=index_name)
print('Index {0:s} created.'.format(index_name))
string_list = os.path.join(output_path, 'wordlist.txt')
with open(string_list, 'r') as strings:
for line in strings:
if not line.startswith('#'):
string_record = _StringRecord()
string_record.image = image_hash
line = line.split('\t')
offset = line[0]
data = '\t'.join(line[1:])
if offset.find('-') > 0:
offset = offset.split('-')
image_offset = offset[0]
file_offset = '-'.join(offset[1:])
string_record.offset = int(image_offset)
string_record.file_offset = file_offset
else:
string_record.offset = int(offset)
string_record.data = data
records = index_record(es, index_name, string_record)
if records % STRING_INDEXING_LOG_INTERVAL == 0:
print('Indexed {0:d} records...'.format(records))
records = es.import_event(index_name)
print('\n*** Indexed {0:d} strings.'.format(records))
def index_record(es, index_name, string_record):
"""Index a single record.
Args:
es: Elasticsearch datastore
index_name: ID of the elasticsearch index
string_record: String record to be indexed
Returns:
Number of records processed
"""
json_record = {
'image': string_record.image,
'offset': string_record.offset,
'file_offset': string_record.file_offset,
'data': string_record.data
}
return es.import_event(index_name, event=json_record)
def search(query, case, image_path=None, query_list=None):
"""Search function.
Searches either the index for a single image, or indexes for all images
in a given case if no image_path is specified.
Args:
query: The query to run against the index
case: The case to query (if no specific image is provided)
image_path: Optional path of the source image
query_list: Path to a text file containing multiple search terms
"""
case_db = PostgresqlDataStore()
images = {}
if image_path:
image_path = os.path.abspath(image_path)
image_hash = case_db.query_single_row(
'SELECT image_hash FROM images WHERE image_path = \'{0:s}\''.format(
image_path))
images[image_hash[0]] = image_path
else:
print(
'No image specified, searching all images in case \'{0:s}\''.format(
case))
image_hashes = case_db.query(
'SELECT image_hash FROM image_case WHERE case_id = \'{0:s}\''.format(
case))
for image_hash in image_hashes:
image_hash = image_hash[0]
image_path = case_db.query_single_row(
'SELECT image_path FROM images WHERE image_hash = \'{0:s}\''.format(
image_hash))
images[image_hash] = image_path[0]
for image_hash, image_path in images.items():
print('\n\nSearching {0:s} ({1:s})'.format(images[image_hash], image_hash))
index = ''.join(('es', image_hash))
if query_list:
with open(query_list, 'r') as search_terms:
print('\n*** Searching for terms in \'{0:s}\'...'.format(query_list))
for term in search_terms:
term = ''.join(('"', term.strip(), '"'))
results = search_index(index, term)
if results['hits']['total']['value'] > 0:
print(
'{0:s} - {1:d} hits'.format(
term, results['hits']['total']['value']))
else:
print('\n*** Searching for \'{0:s}\'...'.format(query))
results = search_index(index, query)
print('Returned {0:d} results:'.format(results['hits']['total']['value']))
for hit in results['hits']['hits']:
filename = image.get_filename_from_offset(
image_path, hit['_source']['image'], int(hit['_source']['offset']))
if hit['_source']['file_offset']:
print(
'Offset: {0:d}\tFile: {1:s}\tFile offset:{2:s}\t'
'String: {3:s}'.format(
hit['_source']['offset'], filename,
hit['_source']['file_offset'],
hit['_source']['data'].strip()))
else:
print(
'Offset: {0:d}\tFile: {1:s}\tString: {2:s}'.format(
hit['_source']['offset'], filename,
hit['_source']['data'].strip()))
def search_index(index_id, search_query):
"""ElasticSearch search function.
Args:
index_id: The ID of the index to be searched
search_query: The query to run against the index
Returns:
Search results returned
"""
es = ElasticsearchDataStore()
return es.search(index_id, search_query)
def main():
"""Main DFDewey function."""
args = parse_args()
if not args.search and not args.search_list:
process_image(
args.image, args.case, not args.no_base64, not args.no_gzip,
not args.no_zip)
elif args.search:
search(args.search, args.case, args.image)
elif args.search_list:
search(None, args.case, args.image, args.search_list)
# Log to stdout
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter('[%(levelname)s] %(message)s')
console_handler.setFormatter(formatter)
log.addHandler(console_handler)
if __name__ == '__main__':


@@ -1,414 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image File Access Functions."""
import pytsk3
from dfdewey.datastore.postgresql import PostgresqlDataStore
def initialise_block_db(image_path, image_hash, case):
"""Creates a new image database.
Args:
image_path: Path to image file
image_hash: MD5 of the image
case: Case ID
Returns:
Boolean value to indicate whether the image has already been processed
"""
img = pytsk3.Img_Info(image_path)
block_db = PostgresqlDataStore(autocommit=True)
image_exists = check_tracking_database(block_db, image_path, image_hash, case)
if not image_exists:
db_name = ''.join(('fs', image_hash))
block_db.execute('CREATE DATABASE {0:s}'.format(db_name))
block_db.switch_database(db_name=db_name)
populate_block_db(img, block_db, batch_size=1500)
return image_exists
def check_tracking_database(tracking_db, image_path, image_hash, case):
"""Checks if an image exists in the tracking database.
Checks if an image exists in the tracking database and adds it if not.
If the image exists, but is not associated with the given case ID, will add
the association.
Args:
tracking_db: PostgreSQL database
image_path: Path to image file
image_hash: MD5 of the image
case: Case ID
Returns:
Boolean value to indicate the existence of the image
"""
tables_exist = tracking_db.table_exists('images')
image_exists = False
if not tables_exist:
tracking_db.execute(
'CREATE TABLE images (image_path TEXT, image_hash TEXT PRIMARY KEY)')
tracking_db.execute(
"""
CREATE TABLE image_case (
case_id TEXT, image_hash TEXT REFERENCES images(image_hash),
PRIMARY KEY (case_id, image_hash))""")
else:
image_exists = tracking_db.value_exists('images', 'image_hash', image_hash)
image_case_exists = False
if image_exists:
image_case = tracking_db.query_single_row(
"""
SELECT 1 from image_case
WHERE image_hash = '{0:s}' AND case_id = '{1:s}'""".format(
image_hash, case))
if image_case:
image_case_exists = True
if not image_exists:
tracking_db.execute(
"""
INSERT INTO images (image_path, image_hash)
VALUES ('{0:s}', '{1:s}')""".format(image_path, image_hash))
if not image_case_exists:
tracking_db.execute(
"""
INSERT INTO image_case (case_id, image_hash)
VALUES ('{0:s}', '{1:s}')""".format(case, image_hash))
return image_exists
def populate_block_db(img, block_db, batch_size=1500):
"""Creates a new image block database.
Args:
img: pytsk image info object
block_db: PostgreSQL database
batch_size: Number of rows to insert at a time
"""
print('Image database does not already exist. Parsing image filesystem(s)...')
block_db.execute(
'CREATE TABLE blocks (block INTEGER, inum INTEGER, part INTEGER)')
block_db.execute(
'CREATE TABLE files (inum INTEGER, filename TEXT, part INTEGER)')
has_partition_table = False
try:
volume = pytsk3.Volume_Info(img)
if volume:
print('Image has a partition table...')
has_partition_table = True
rows = []
for part in volume:
print(
'Parsing partition {0:d}: {1:s}'.format(
part.addr, part.desc.decode('utf-8')))
if part.flags != pytsk3.TSK_VS_PART_FLAG_ALLOC:
continue
filesystem = pytsk3.FS_Info(
img, offset=part.start * volume.info.block_size)
for inode in range(filesystem.info.first_inum,
filesystem.info.last_inum + 1):
file = filesystem.open_meta(inode)
if file.info.meta.nlink > 0:
for attr in file:
for run in attr:
for block in range(run.len):
rows.append((
run.addr + block,
inode,
part.addr,
))
if len(rows) >= batch_size:
block_db.bulk_insert('blocks (block, inum, part)', rows)
rows = []
if rows:
block_db.bulk_insert('blocks (block, inum, part)', rows)
# File names
directory = filesystem.open_dir(path='/')
list_directory(block_db, directory, part=part.addr, batch_size=batch_size)
except IOError:
pass
if not has_partition_table:
filesystem = pytsk3.FS_Info(img)
rows = []
for inode in range(filesystem.info.first_inum,
filesystem.info.last_inum + 1):
try:
file = filesystem.open_meta(inode)
if file.info.meta.nlink > 0:
for attr in file:
for run in attr:
for block in range(run.len):
rows.append((
run.addr + block,
inode,
))
if len(rows) >= batch_size:
block_db.bulk_insert('blocks (block, inum)', rows)
rows = []
if rows:
block_db.bulk_insert('blocks (block, inum)', rows)
except OSError:
continue
# File names
directory = filesystem.open_dir(path='/')
list_directory(block_db, directory, batch_size=batch_size)
block_db.execute('CREATE INDEX blocks_index ON blocks (block, part);')
block_db.execute('CREATE INDEX files_index ON files (inum, part);')
def list_directory(
block_db, directory, part=None, stack=None, rows=None, batch_size=1500):
"""Recursive function to create a filesystem listing.
Args:
block_db: PostgreSQL database
directory: pytsk directory object
part: Partition number
stack: Inode stack to control recursive filesystem parsing
rows: Array for batch database inserts
batch_size: Number of rows to insert at a time
Returns:
Current rows array for recursion
"""
if not stack:
stack = []
if not rows:
rows = []
stack.append(directory.info.fs_file.meta.addr)
for directory_entry in directory:
# TODO(js): Refactor
if (not hasattr(directory_entry, 'info') or
not hasattr(directory_entry.info, 'name') or
not hasattr(directory_entry.info.name, 'name') or
directory_entry.info.meta is None or
directory_entry.info.name.name in [b'.', b'..'] or
directory_entry.info.name.flags == pytsk3.TSK_FS_NAME_FLAG_UNALLOC):
continue
try:
name = directory_entry.info.name.name.decode('utf-8')
except UnicodeDecodeError:
print('Unable to decode: {}'.format(directory_entry.info.name.name))
continue
if part:
rows.append((
directory_entry.info.meta.addr,
name.replace('\'', '\'\''),
part,
))
if len(rows) >= batch_size:
block_db.bulk_insert('files (inum, filename, part)', rows)
rows = []
else:
rows.append((
directory_entry.info.meta.addr,
name.replace('\'', '\'\''),
))
if len(rows) >= batch_size:
block_db.bulk_insert('files (inum, filename)', rows)
rows = []
try:
sub_directory = directory_entry.as_directory()
inode = directory_entry.info.meta.addr
if inode not in stack:
rows = list_directory(
block_db, sub_directory, part=part, stack=stack, rows=rows,
batch_size=batch_size)
except IOError:
pass
stack.pop(-1)
if not stack:
if part:
block_db.bulk_insert('files (inum, filename, part)', rows)
else:
block_db.bulk_insert('files (inum, filename)', rows)
return rows
def get_filename_from_offset(image_path, image_hash, offset):
"""Gets filename given a byte offset within an image.
Args:
image_path: Source image path
image_hash: Source image hash
offset: Byte offset within the image
Returns:
Filename allocated to the given offset
"""
img = pytsk3.Img_Info(image_path)
db_name = ''.join(('fs', image_hash))
block_db = PostgresqlDataStore(db_name=db_name)
device_block_size = None
partition = None
partition_offset = None
unalloc_part = False
try:
volume = pytsk3.Volume_Info(img)
device_block_size = volume.info.block_size
sector_offset = offset / device_block_size
for part in volume:
if part.start <= sector_offset < part.start + part.len:
if part.flags != pytsk3.TSK_VS_PART_FLAG_ALLOC:
unalloc_part = True
partition = part.addr
partition_offset = part.start
except IOError:
pass
inums = None
if not unalloc_part:
try:
if not partition_offset:
filesystem = pytsk3.FS_Info(img)
else:
offset -= partition_offset * device_block_size
filesystem = pytsk3.FS_Info(
img, offset=partition_offset * device_block_size)
except TypeError as e:
print(e)
block_size = filesystem.info.block_size
inums = get_inums(block_db, offset / block_size, part=partition)
filenames = []
if inums:
for i in inums:
real_inum = i[0]
if i[0] == 0 and filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT:
mft_record_size_offset = 0x40
if partition_offset:
mft_record_size_offset = \
mft_record_size_offset + (partition_offset * device_block_size)
mft_record_size = int.from_bytes(
img.read(mft_record_size_offset, 1), 'little', signed=True)
if mft_record_size < 0:
mft_record_size = 2**(mft_record_size * -1)
else:
mft_record_size = mft_record_size * block_size
real_inum = get_resident_inum(offset, filesystem, mft_record_size)
filename = get_filename(block_db, real_inum, part=partition)
if filename and not filenames:
filenames.append('{0:s} ({1:d})'.format(filename, real_inum))
else:
if '{0:s} ({1:d})'.format(filename, real_inum) not in filenames:
filenames.append('{0:s} ({1:d})'.format(filename, real_inum))
if not filenames:
return 'No filenames found'
else:
return ' | '.join(filenames)
def get_inums(block_db, block, part=None):
"""Gets inode number from block offset.
Args:
block_db: PostgreSQL database
block: Block offset within the image
part: Partition number
Returns:
Inode number(s) of the given block or None
"""
if part:
inums = block_db.query(
'SELECT inum FROM blocks WHERE block = {0:d} AND part = {1:d}'.format(
int(block), part))
else:
inums = block_db.query(
'SELECT inum FROM blocks WHERE block = {0:d}'.format(int(block)))
return inums
def get_resident_inum(offset, filesystem, mft_record_size):
"""Gets the inode number associated with NTFS $MFT resident data.
Args:
offset: Data offset within volume
filesystem: pytsk3 FS_INFO object
mft_record_size: Size of an $MFT entry
Returns:
inode number of resident data
"""
block_size = filesystem.info.block_size
offset_block = int(offset / block_size)
inode = filesystem.open_meta(0)
mft_entry = 0
for attr in inode:
for run in attr:
for block in range(run.len):
if run.addr + block == offset_block:
mft_entry += int(
(offset - (offset_block * block_size)) / mft_record_size)
return mft_entry
else:
mft_entry += int(block_size / mft_record_size)
return 0
def get_filename(block_db, inum, part=None):
"""Gets filename given an inode number.
Args:
block_db: PostgreSQL database
inum: Inode number of target file
part: Partition number
Returns:
Filename of given inode or None
"""
if part:
filenames = block_db.query(
'SELECT filename FROM files WHERE inum = {0:d} AND part = {1:d}'.format(
inum, part))
else:
filenames = block_db.query(
'SELECT filename FROM files WHERE inum = {0:d}'.format(inum))
if filenames:
filename = filenames[0][0]
else:
filename = 'No filenames found'
return filename


@@ -0,0 +1,637 @@
# -*- coding: utf-8 -*-
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor."""
from datetime import datetime
import logging
import os
import subprocess
import tempfile
from dfvfs.helpers import volume_scanner
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.lib import errors as dfvfs_errors
from dfvfs.resolver import resolver
from dfvfs.volume import tsk_volume_system
import pytsk3
from dfdewey.datastore.elastic import ElasticsearchDataStore
from dfdewey.datastore.postgresql import PostgresqlDataStore
BATCH_SIZE = 1500
STRING_INDEXING_LOG_INTERVAL = 10000000
log = logging.getLogger('dfdewey.image_processor')
class _StringRecord():
"""Elasticsearch string record.
Attributes:
image: Hash to identify the source image of the string
offset: Byte offset of the string within the source image
file_offset: If the string is extracted from a compressed stream, the byte
offset within the stream
data: The string to be indexed
"""
def __init__(self):
self.image = ''
self.offset = 0
self.file_offset = None
self.data = ''
class FileEntryScanner(volume_scanner.VolumeScanner):
"""File entry scanner."""
_NON_PRINTABLE_CHARACTERS = list(range(0, 0x20)) + list(range(0x7f, 0xa0))
_ESCAPE_CHARACTERS = str.maketrans({
value: '\\x{0:02x}'.format(value) for value in _NON_PRINTABLE_CHARACTERS
})
def __init__(self, mediator=None):
"""Initializes a file entry scanner.
Args:
mediator (VolumeScannerMediator): a volume scanner mediator.
"""
super().__init__(mediator=mediator)
self._datastore = None
self._list_only_files = False
self._rows = []
self._volumes = {}
def _get_display_path(self, path_spec, path_segments, data_stream_name):
"""Retrieves a path to display.
Args:
path_spec (dfvfs.PathSpec): path specification of the file entry.
path_segments (list[str]): path segments of the full path of the file
entry.
data_stream_name (str): name of the data stream.
Returns:
str: path to display.
"""
display_path = ''
if path_spec.HasParent():
parent_path_spec = path_spec.parent
if parent_path_spec and parent_path_spec.type_indicator == (
dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION):
display_path = ''.join([display_path, parent_path_spec.location])
path_segments = [
segment.translate(self._ESCAPE_CHARACTERS) for segment in path_segments
]
display_path = ''.join([display_path, '/'.join(path_segments)])
if data_stream_name:
data_stream_name = data_stream_name.translate(self._ESCAPE_CHARACTERS)
display_path = ':'.join([display_path, data_stream_name])
return display_path or '/'
def _get_inode(self, path_spec):
"""Gets the inode from a file entry path spec.
Args:
path_spec (dfvfs.PathSpec): file entry path spec.
"""
inode = None
if path_spec.type_indicator == dfvfs_definitions.TYPE_INDICATOR_NTFS:
inode = getattr(path_spec, 'mft_entry', None)
else:
inode = getattr(path_spec, 'inode', None)
return inode
def _get_tsk_partition_path_spec(self, path_spec):
"""Gets the path spec for the TSK partition.
Args:
path_spec (dfvfs.PathSpec): path spec of the volume.
Returns:
TSK partition path_spec or None.
"""
partition_path_spec = None
while path_spec.HasParent():
type_indicator = path_spec.type_indicator
if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION:
partition_path_spec = path_spec
break
path_spec = path_spec.parent
return partition_path_spec
def _get_volume_location(self, path_spec):
"""Gets volume location / identifier for the given path spec.
Args:
path_spec (dfvfs.PathSpec): path spec of the volume.
Returns:
Volume location / identifier.
"""
location = getattr(path_spec, 'location', None)
while path_spec.HasParent():
type_indicator = path_spec.type_indicator
if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION:
if location in ('\\', '/'):
location = getattr(path_spec, 'location', None)
break
path_spec = path_spec.parent
return location
def _list_file_entry(
self, file_system, file_entry, parent_path_segments, location):
"""Lists a file entry.
Args:
file_system (dfvfs.FileSystem): file system that contains the file entry.
file_entry (dfvfs.FileEntry): file entry to list.
parent_path_segments (str): path segments of the full path of the parent
file entry.
location (str): volume location / identifier.
"""
path_segments = parent_path_segments + [file_entry.name]
inode = self._get_inode(file_entry.path_spec)
filename = self._get_display_path(file_entry.path_spec, path_segments, '')
if not self._list_only_files or file_entry.IsFile():
if inode is not None:
self._rows.append((
inode,
filename,
location,
))
for data_stream in file_entry.data_streams:
if not data_stream.IsDefault():
filename = ':'.join((filename, data_stream.name))
self._rows.append((
inode,
filename,
location,
))
if len(self._rows) >= BATCH_SIZE:
self._datastore.bulk_insert(
'files (inum, filename, part)', self._rows)
self._rows = []
for sub_file_entry in file_entry.sub_file_entries:
self._list_file_entry(
file_system, sub_file_entry, path_segments, location)
def get_volume_extents(self, image_path):
"""Gets the extents of all volumes.
Args:
image_path (str): path of the source image.
Returns:
Volume location / identifier, offset, and size for all volumes.
"""
if not self._volumes or self._source_path != image_path:
base_path_specs = self.GetBasePathSpecs(image_path)
for path_spec in base_path_specs:
partition_path_spec = self._get_tsk_partition_path_spec(path_spec)
if not partition_path_spec:
location = getattr(path_spec, 'location', None)
self._volumes[location] = {'start': 0, 'end': None}
else:
location = getattr(partition_path_spec, 'location', None)
partition_offset = None
partition_size = None
volume_system = tsk_volume_system.TSKVolumeSystem()
try:
volume_system.Open(partition_path_spec)
volume_identifier = location.replace('/', '')
volume = volume_system.GetVolumeByIdentifier(volume_identifier)
partition_offset = volume.extents[0].offset
partition_size = volume.extents[0].size
except dfvfs_errors.VolumeSystemError as e:
log.error('Could not process partition: %s', e)
self._volumes[location] = {
'start': partition_offset,
'end': partition_offset + partition_size
}
return self._volumes
def parse_file_entries(self, base_path_specs, datastore):
"""Parses file entries in the base path specification.
Stores parsed entries in the PostgreSQL datastore.
Args:
base_path_specs (list[dfvfs.PathSpec]): source path specification.
datastore (PostgresqlDataStore): PostgreSQL datastore.
"""
self._datastore = datastore
for base_path_spec in base_path_specs:
file_system = resolver.Resolver.OpenFileSystem(base_path_spec)
file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
if file_entry is None:
log.warning(
'Unable to open base path specification: %s',
base_path_spec.comparable)
return
location = self._get_volume_location(base_path_spec)
self._list_file_entry(file_system, file_entry, [], location)
if self._rows:
self._datastore.bulk_insert('files (inum, filename, part)', self._rows)
self._rows = []
class ImageProcessor():
"""Image processor class.
Attributes:
case (str): case ID.
elasticsearch (ElasticsearchDataStore): elasticsearch datastore.
image_hash (str): MD5 hash of the image.
image_path (str): path to source image.
options (ImageProcessorOptions): image processor options.
output_path (str): output directory for string extraction.
path_specs (dfvfs.PathSpec): volume path specs.
postgresql (PostgresqlDataStore): postgresql database.
scanner (FileEntryScanner): dfvfs volume / file entry scanner.
"""
def __init__(self, case, image_path, options):
"""Create an image processor."""
super().__init__()
self.case = case
self.elasticsearch = None
self.image_hash = None
self.image_path = image_path
self.options = options
self.output_path = None
self.path_specs = []
self.postgresql = None
self.scanner = None
def _already_parsed(self):
"""Check if image is already parsed.
Checks whether the image is already in the database.
If so, checks whether it's attached to the case.
Adds the image to the database and attaches it to the case.
Returns:
True if image has already been parsed, False if not.
"""
tables_exist = self.postgresql.table_exists('images')
image_exists = False
if not tables_exist:
self._initialise_database()
else:
image_exists = self.postgresql.value_exists(
'images', 'image_hash', self.image_hash)
# Even if the image has already been parsed, it may have been in a different
# case.
image_case_exists = False
if image_exists:
image_case = self.postgresql.query_single_row((
'SELECT 1 from image_case '
'WHERE image_hash = \'{0:s}\' AND case_id = \'{1:s}\'').format(
self.image_hash, self.case))
if image_case:
image_case_exists = True
else:
self.postgresql.execute((
'INSERT INTO images (image_path, image_hash) '
'VALUES (\'{0:s}\', \'{1:s}\')').format(
self.image_path, self.image_hash))
if not image_case_exists:
self.postgresql.execute((
'INSERT INTO image_case (case_id, image_hash) '
'VALUES (\'{0:s}\', \'{1:s}\')').format(self.case, self.image_hash))
return image_exists
def _create_filesystem_database(self):
"""Create a filesystem database for the image."""
self.postgresql.execute((
'CREATE TABLE blocks (block INTEGER, inum INTEGER, part TEXT, '
'PRIMARY KEY (block, inum, part))'))
self.postgresql.execute((
'CREATE TABLE files (inum INTEGER, filename TEXT, part TEXT, '
'PRIMARY KEY (inum, filename, part))'))
def _extract_strings(self):
"""String extraction.
Extract strings from the image using bulk_extractor.
"""
cmd = [
'bulk_extractor', '-o', self.output_path, '-x', 'all', '-e', 'wordlist'
]
if self.options.base64:
cmd.extend(['-e', 'base64'])
if self.options.gunzip:
cmd.extend(['-e', 'gzip'])
if self.options.unzip:
cmd.extend(['-e', 'zip'])
cmd.extend(['-S', 'strings=YES', '-S', 'word_max=1000000'])
cmd.append(self.image_path)
log.info('Running bulk_extractor: [%s]', ' '.join(cmd))
try:
output = subprocess.check_output(cmd)
except subprocess.CalledProcessError as e:
raise RuntimeError('String extraction failed.') from e
md5_offset = output.index(b'MD5') + 19
self.image_hash = output[md5_offset:md5_offset + 32].decode('utf-8')
def _get_volume_details(self, path_spec):
"""Logs volume details for the given path spec.
Args:
path_spec (dfvfs.PathSpec): path spec of the volume.
Returns:
Volume location / identifier and byte offset.
"""
location = getattr(path_spec, 'location', None)
start_offset = 0
while path_spec.HasParent():
type_indicator = path_spec.type_indicator
if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK_PARTITION:
if location in ('\\', '/'):
location = getattr(path_spec, 'location', None)
start_offset = getattr(path_spec, 'start_offset', 0)
break
path_spec = path_spec.parent
return location, start_offset
def _index_record(self, index_name, string_record):
"""Index a single record.
Args:
index_name: ID of the elasticsearch index.
string_record: String record to be indexed.
Returns:
Number of records processed
"""
json_record = {
'image': string_record.image,
'offset': string_record.offset,
'file_offset': string_record.file_offset,
'data': string_record.data
}
return self.elasticsearch.import_event(index_name, event=json_record)
def _index_strings(self):
"""Index the extracted strings."""
self.elasticsearch = ElasticsearchDataStore()
index_name = ''.join(('es', self.image_hash))
if self.elasticsearch.index_exists(index_name):
log.info('Image already indexed: [%s]', self.image_path)
else:
index_name = self.elasticsearch.create_index(index_name=index_name)
log.info('Index %s created.', index_name)
string_list = os.path.join(self.output_path, 'wordlist.txt')
records = 0
with open(string_list, 'r') as strings:
for line in strings:
# Ignore the comments added by bulk_extractor
if not line.startswith('#'):
string_record = _StringRecord()
string_record.image = self.image_hash
# Split each string into offset and data
line = line.split('\t')
offset = line[0]
data = '\t'.join(line[1:])
# If the string is from a decoded / decompressed stream, split the
# offset into image offset and file offset
if offset.find('-') > 0:
offset = offset.split('-')
image_offset = offset[0]
file_offset = '-'.join(offset[1:])
string_record.offset = int(image_offset)
string_record.file_offset = file_offset
else:
string_record.offset = int(offset)
string_record.data = data
records = self._index_record(index_name, string_record)
if records % STRING_INDEXING_LOG_INTERVAL == 0:
log.info('Indexed %d records...', records)
# Flush the import buffer
records = self.elasticsearch.import_event(index_name)
log.info('Indexed %d records...', records)
def _initialise_database(self):
"""Initialse the image database."""
self.postgresql.execute(
'CREATE TABLE images (image_path TEXT, image_hash TEXT PRIMARY KEY)')
self.postgresql.execute((
'CREATE TABLE image_case ('
'case_id TEXT, image_hash TEXT REFERENCES images(image_hash), '
'PRIMARY KEY (case_id, image_hash))'))
def _parse_filesystems(self):
"""Filesystem parsing.
Parse each filesystem to create a mapping from byte offsets to files.
"""
self.postgresql = PostgresqlDataStore(autocommit=True)
if self._already_parsed():
log.info('Image already parsed: [%s]', self.image_path)
else:
db_name = ''.join(('fs', self.image_hash))
self.postgresql.execute('CREATE DATABASE {0:s}'.format(db_name))
self.postgresql.switch_database(db_name=db_name)
self._create_filesystem_database()
# Scan image for volumes
mediator = UnattendedVolumeScannerMediator()
try:
self.scanner = FileEntryScanner(mediator=mediator)
self.path_specs = self.scanner.GetBasePathSpecs(self.image_path)
log.info(
'Found %d volume%s in [%s]:', len(self.path_specs),
'' if len(self.path_specs) == 1 else 's', self.image_path)
except dfvfs_errors.ScannerError as e:
log.error('Error scanning for partitions: %s', e)
for path_spec in self.path_specs:
location, start_offset = self._get_volume_details(path_spec)
log.info(
'%s: %s (Offset %d)', location, path_spec.type_indicator,
start_offset)
if path_spec.type_indicator in (dfvfs_definitions.TYPE_INDICATOR_NTFS,
dfvfs_definitions.TYPE_INDICATOR_TSK):
self._parse_inodes(location, start_offset)
self.scanner.parse_file_entries([path_spec], self.postgresql)
else:
log.warning(
'Volume type %s is not supported.', path_spec.type_indicator)
def _parse_inodes(self, location, start_offset):
"""Parse filesystem inodes.
Create a mapping from blocks to inodes.
Args:
location (str): location / identifier of the volume.
start_offset (int): byte offset of the volume.
"""
rows = []
image = pytsk3.Img_Info(self.image_path)
filesystem = pytsk3.FS_Info(image, offset=start_offset)
for inode in range(filesystem.info.first_inum,
filesystem.info.last_inum + 1):
file_metadata = filesystem.open_meta(inode)
if file_metadata.info.meta.nlink > 0:
for attribute in file_metadata:
for run in attribute:
for block in range(run.len):
rows.append((
run.addr + block,
inode,
location,
))
if len(rows) >= BATCH_SIZE:
self.postgresql.bulk_insert('blocks (block, inum, part)', rows)
rows = []
if rows:
self.postgresql.bulk_insert('blocks (block, inum, part)', rows)
def process_image(self):
"""Process the image."""
self.output_path = tempfile.mkdtemp()
log.info('* Processing start: %s', datetime.now())
self._extract_strings()
log.info('String extraction complete.')
log.info('* Parsing image: %s', datetime.now())
self._parse_filesystems()
log.info('Parsing complete.')
log.info('* Indexing strings: %s', datetime.now())
self._index_strings()
log.info('Indexing complete.')
log.info('* Processing complete: %s', datetime.now())
class ImageProcessorOptions():
"""Image processor options.
Attributes:
base64 (bool): decode base64.
gunzip (bool): decompress gzip.
unzip (bool): decompress zip.
"""
def __init__(self, base64=True, gunzip=True, unzip=True):
"""Initialise image processor options."""
super().__init__()
self.base64 = base64
self.gunzip = gunzip
self.unzip = unzip
class UnattendedVolumeScannerMediator(volume_scanner.VolumeScannerMediator):
"""Unattended volume scanner mediator."""
def GetAPFSVolumeIdentifiers(self, volume_system, volume_identifiers):
"""Retrieves APFS volume identifiers.
In an unattended execution, this method returns all volume identifiers.
Args:
volume_system (APFSVolumeSystem): volume system.
volume_identifiers (list[str]): volume identifiers including prefix.
Returns:
list[str]: all volume identifiers including prefix.
"""
prefix = 'apfs'
return [
'{0:s}{1:d}'.format(prefix, volume_index)
for volume_index in range(1, volume_system.number_of_volumes + 1)
]
def GetPartitionIdentifiers(self, volume_system, volume_identifiers):
"""Retrieves partition identifiers.
In an unattended execution, this method returns all partition identifiers.
Args:
volume_system (TSKVolumeSystem): volume system.
volume_identifiers (list[str]): volume identifiers including prefix.
Returns:
list[str]: all volume identifiers including prefix.
"""
prefix = 'p'
return [
'{0:s}{1:d}'.format(prefix, volume_index)
for volume_index in range(1, volume_system.number_of_volumes + 1)
]
def GetVSSStoreIdentifiers(self, volume_system, volume_identifiers):
"""Retrieves VSS store identifiers.
Placeholder method for VSS support.
Args:
volume_system (VShadowVolumeSystem): volume system.
volume_identifiers (list[str]): volume identifiers including prefix.
Returns:
list[str]: None.
"""
return []
def UnlockEncryptedVolume(
self, source_scanner_object, scan_context, locked_scan_node, credentials):
"""Unlocks an encrypted volume.
Placeholder method for encrypted volume support.
Args:
source_scanner_object (SourceScanner): source scanner.
scan_context (SourceScannerContext): source scanner context.
locked_scan_node (SourceScanNode): locked scan node.
credentials (Credentials): credentials supported by the locked scan node.
Returns:
bool: True if the volume was unlocked.
"""
log.warning(
'Encrypted volumes are currently unsupported: %s',
locked_scan_node.path_spec.CopyToDict())
return False
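The processor is driven from `dfdcli.main()` exactly as shown earlier in this change. A hedged usage sketch (case ID and image path are placeholders; Elasticsearch and PostgreSQL are assumed to be running):

```python
import os

from dfdewey.utils.image_processor import ImageProcessor, ImageProcessorOptions

options = ImageProcessorOptions(base64=True, gunzip=True, unzip=True)
image_processor = ImageProcessor(
    'testcase', os.path.abspath('/path/to/image.dd'), options)
image_processor.process_image()
```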


@@ -0,0 +1,284 @@
# -*- coding: utf-8 -*-
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Index searcher."""
import logging
import os
from dfvfs.lib import errors as dfvfs_errors
import pytsk3
from tabulate import tabulate
from dfdewey.datastore.elastic import ElasticsearchDataStore
from dfdewey.datastore.postgresql import PostgresqlDataStore
from dfdewey.utils.image_processor import (
FileEntryScanner, UnattendedVolumeScannerMediator)
log = logging.getLogger('dfdewey.index_searcher')
class _SearchHit():
"""Search result.
Attributes:
offset: byte offset of the string within the source image.
filename: filename containing the string if applicable.
data: the responsive string.
"""
def __init__(self):
self.offset = 0
self.filename = None
self.data = ''
def copy_to_dict(self):
"""Copies the search hit to a dictionary.
Returns:
dict[str, object]: search hit attributes.
"""
search_hit_dict = {}
search_hit_dict['Offset'] = self.offset
search_hit_dict['Filename (inode)'] = self.filename
search_hit_dict['String'] = self.data
return search_hit_dict
class IndexSearcher():
"""Index Searcher class."""
def __init__(self, case, image):
"""Create an index searcher."""
super().__init__()
self.case = case
self.elasticsearch = ElasticsearchDataStore()
self.image = image
self.images = {}
self.postgresql = PostgresqlDataStore()
self.scanner = None
if image != 'all':
self.image = os.path.abspath(self.image)
self._get_image_hash()
else:
self._get_case_images()
def _get_case_images(self):
"""Get all images for the case.
Returns:
A dictionary of the images in the case.
"""
images = self.postgresql.query((
'SELECT image_hash, image_path FROM image_case NATURAL JOIN images '
'WHERE case_id = \'{0:s}\'').format(self.case))
for image_hash, image_path in images:
self.images[image_hash] = image_path
def _get_filenames_from_inode(self, inode, location):
"""Gets filename(s) from an inode number.
Args:
inode: Inode number of target file
location: Partition number
Returns:
Filename of given inode or None
"""
results = self.postgresql.query((
'SELECT filename FROM files '
'WHERE inum = {0:d} AND part = \'{1:s}\'').format(inode, location))
filenames = []
for result in results:
filenames.append(result[0])
return filenames
def _get_filename_from_offset(self, image_path, image_hash, offset):
"""Gets filename given a byte offset within an image.
Args:
image_path: source image path.
image_hash: source image hash.
offset: byte offset within the image.
Returns:
Filename allocated to the given offset, or None.
"""
filenames = []
database_name = ''.join(('fs', image_hash))
self.postgresql.switch_database(db_name=database_name)
volume_extents = {}
try:
if not self.scanner:
mediator = UnattendedVolumeScannerMediator()
self.scanner = FileEntryScanner(mediator=mediator)
volume_extents = self.scanner.get_volume_extents(image_path)
except dfvfs_errors.ScannerError as e:
log.error('Error scanning for partitions: %s', e)
hit_location = None
partition_offset = None
for location, extent in volume_extents.items():
if not extent['end']:
# Image is of a single volume
hit_location = location
partition_offset = extent['start']
elif extent['start'] <= offset < extent['end']:
hit_location = location
partition_offset = extent['start']
if partition_offset is not None:
try:
img = pytsk3.Img_Info(image_path)
filesystem = pytsk3.FS_Info(img, offset=partition_offset)
block_size = filesystem.info.block_size
except TypeError as e:
log.error('Error opening image: %s', e)
inodes = self._get_inodes(
int((offset - partition_offset) / block_size), hit_location)
if inodes:
for i in inodes:
inode = i[0]
# Account for resident files
if (i[0] == 0 and
filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT):
mft_record_size_offset = 0x40 + partition_offset
mft_record_size = int.from_bytes(
img.read(mft_record_size_offset, 1), 'little', signed=True)
if mft_record_size < 0:
mft_record_size = 2**(mft_record_size * -1)
else:
mft_record_size = mft_record_size * block_size
inode = self._get_ntfs_resident_inode((offset - partition_offset),
filesystem, mft_record_size)
inode_filenames = self._get_filenames_from_inode(inode, hit_location)
filename = ' | '.join(inode_filenames)
filenames.append('{0:s} ({1:d})'.format(filename, inode))
return filenames
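The `mft_record_size` handling above reads the signed byte at offset 0x40 of the NTFS volume: a non-negative value is a count of clusters (multiplied by the block size), while a negative value n means 2**(-n) bytes per $MFT entry. A worked example using the value typically found on NTFS volumes (illustrative; not read from any image in this change):

```python
# 0xF6 is the usual byte at offset 0x40 of an NTFS boot sector (-10 signed).
mft_record_size = int.from_bytes(b'\xf6', 'little', signed=True)
if mft_record_size < 0:
    mft_record_size = 2 ** (-mft_record_size)
print(mft_record_size)  # 1024 bytes per $MFT entry
```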
def _get_image_hash(self):
"""Get an image hash from the datastore.
Returns:
MD5 hash for the image stored in PostgreSQL.
"""
image_hash = self.postgresql.query_single_row(
'SELECT image_hash FROM images WHERE image_path = \'{0:s}\''.format(
self.image))
if image_hash:
self.images[image_hash[0]] = self.image
def _get_inodes(self, block, location):
"""Gets inode numbers for a block offset.
Args:
block (int): block offset within the image.
location (str): Partition location / identifier.
Returns:
Inode number(s) of the given block or None.
"""
inodes = self.postgresql.query(
('SELECT inum FROM blocks '
'WHERE block = {0:d} AND part = \'{1:s}\'').format(block, location))
return inodes
def _get_ntfs_resident_inode(self, offset, filesystem, mft_record_size):
"""Gets the inode number associated with NTFS $MFT resident data.
Args:
offset: data offset within volume.
filesystem: pytsk3 FS_INFO object.
mft_record_size: size of each $MFT entry.
Returns:
inode number of resident data
"""
block_size = filesystem.info.block_size
offset_block = int(offset / block_size)
inode = filesystem.open_meta(0)
mft_entry = 0
for attr in inode:
for run in attr:
for block in range(run.len):
if run.addr + block == offset_block:
mft_entry += int(
(offset - (offset_block * block_size)) / mft_record_size)
return mft_entry
mft_entry += int(block_size / mft_record_size)
return 0
def list_search(self, query_list):
"""Query a list of search terms.
Args:
query_list (str): path to a text file containing multiple search terms.
"""
for image_hash, image_path in self.images.items():
index = ''.join(('es', image_hash))
with open(query_list, 'r') as search_terms:
table_data = []
for term in search_terms:
term = ''.join(('"', term.strip(), '"'))
results = self.elasticsearch.search(index, term)
hit_count = results['hits']['total']['value']
if hit_count > 0:
table_data.append({'Search term': term, 'Hits': hit_count})
if table_data:
output = tabulate(table_data, headers='keys', tablefmt='simple')
else:
output = 'No results.'
log.info(
'Searched %s (%s) for terms in %s\n\n%s\n', image_path, image_hash,
query_list, output)
def search(self, query):
"""Run a single query.
Args:
query (str): query to run.
"""
for image_hash, image_path in self.images.items():
log.info('Searching %s (%s) for "%s"', image_path, image_hash, query)
index = ''.join(('es', image_hash))
results = self.elasticsearch.search(index, query)
result_count = results['hits']['total']['value']
time_taken = results['took']
results = results['hits']['hits']
hits = []
for result in results:
hit = _SearchHit()
offset = str(result['_source']['offset'])
if result['_source']['file_offset']:
offset = '-'.join((offset, result['_source']['file_offset']))
hit.offset = offset
filenames = self._get_filename_from_offset(
image_path, image_hash, result['_source']['offset'])
hit.filename = '\n'.join(filenames)
hit.data = result['_source']['data'].strip()
hits.append(hit.copy_to_dict())
output = tabulate(hits, headers='keys', tablefmt='simple')
log.info(
'Returned %d results in %dms.\n\n%s\n', result_count, time_taken,
output)
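The searcher is likewise driven from `dfdcli.main()`. A hedged usage sketch (case ID and query are placeholders; passing 'all' as the image searches every image recorded for the case):

```python
from dfdewey.utils.index_searcher import IndexSearcher

index_searcher = IndexSearcher('testcase', 'all')
index_searcher.search('foo')
# Or, for a file containing one search term per line:
# index_searcher.list_search('search_terms.txt')
```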

dfvfs_requirements.txt (new file)

@@ -0,0 +1,26 @@
pip >= 7.0.0
PyYAML >= 3.10
cffi >= 1.9.1
cryptography >= 2.0.2
dfdatetime >= 20200809
dtfabric >= 20170524
idna >= 2.5
libbde-python >= 20140531
libewf-python >= 20131210
libfsapfs-python >= 20201107
libfsext-python >= 20200819
libfshfs-python >= 20201103
libfsntfs-python >= 20200921
libfsxfs-python >= 20201114
libfvde-python >= 20160719
libfwnt-python >= 20160418
libluksde-python >= 20200101
libqcow-python >= 20131204
libsigscan-python >= 20191221
libsmdev-python >= 20140529
libsmraw-python >= 20140612
libvhdi-python >= 20201014
libvmdk-python >= 20140421
libvshadow-python >= 20160109
libvslvm-python >= 20160109
pytsk3 >= 20160721


@@ -1,14 +1,14 @@
# Using dfDewey
```shell
usage: dfdcli.py [-h] -c CASE [-i IMAGE] [--no_base64] [--no_gzip] [--no_zip]
[-s SEARCH] [--search_list SEARCH_LIST]
usage: dfdcli.py [-h] [--no_base64] [--no_gzip] [--no_zip] [-s SEARCH] [--search_list SEARCH_LIST] case [image]
positional arguments:
case case ID
image image file (default: 'all')
optional arguments:
-h, --help show this help message and exit
-c CASE, --case CASE case ID
-i IMAGE, --image IMAGE
image file
--no_base64 don't decode base64
--no_gzip don't decompress gzip
--no_zip don't decompress zip
@@ -16,6 +16,7 @@ optional arguments:
search query
--search_list SEARCH_LIST
file with search queries
```
## Docker
@@ -59,7 +60,7 @@ docker run --network=host -v ~/images/:/mnt/images <docker_name> dfdewey -h
To process an image in dfDewey, you need to supply a `CASE` and `IMAGE`.
```shell
dfdcli.py -c testcase -i /path/to/image.dd
dfdcli.py testcase /path/to/image.dd
```
dfDewey will have bulk_extractor decode base64 data, and decompress gzip / zip
@@ -72,7 +73,7 @@ To search the index for a single image, you need to supply a `CASE`, `IMAGE`,
and `SEARCH`.
```shell
dfdcli.py -c testcase -i /path/to/image.dd -s foo
dfdcli.py testcase /path/to/image.dd -s 'foo'
```
If an `IMAGE` is not provided, dfDewey will search all images in the given case.
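For example, to search all images in `testcase`:

```shell
dfdcli.py testcase -s 'foo'
```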
@@ -82,5 +83,5 @@ a text file one per line. In this case, only the number of results for each term
is returned.
```shell
dfdcli.py -c testcase -i /path/to/image.dd --search_list search_terms.txt
dfdcli.py testcase /path/to/image.dd --search_list search_terms.txt
```


@@ -1,4 +1,6 @@
dfvfs
elasticsearch
psycopg2-binary
pytsk3
six
tabulate


@@ -31,6 +31,8 @@ DFDEWEY_DESCRIPTION = (
requirements = []
with open('requirements.txt','r') as f:
requirements = f.read().splitlines()
with open('dfvfs_requirements.txt','r') as f:
requirements.extend(f.read().splitlines())
setup(
name='dfDewey',
version=dfdewey.__version__,