Refactoring CLI, processing and searching
parent 2f78367a37
commit 15ad0beb02
3 changed files with 27 additions and 416 deletions
@@ -68,7 +68,7 @@ def main():
   if args.search:
     index_searcher.search(args.search)
   elif args.search_list:
-    pass
+    index_searcher.list_search(args.search_list)
 
 
 def parse_args():
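The one-line change above wires --search_list to the new IndexSearcher.list_search() method in place of the old pass stub. parse_args() itself is outside this diff, so the following argparse sketch of the two flags main() dispatches on is an assumption, not the commit's code:

import argparse


def parse_args():
  # Hypothetical flag wiring: names mirror the attributes main() reads
  # (args.search, args.search_list); short flag and help text are assumed.
  parser = argparse.ArgumentParser(description='Search an indexed image.')
  parser.add_argument('-s', '--search', help='a single search term to run')
  parser.add_argument(
      '--search_list',
      help='path to a text file of search terms, one per line')
  return parser.parse_args()

With wiring like this, passing --search_list terms.txt reaches the new list_search() code path shown further down.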
@@ -1,414 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Image File Access Functions."""
-
-import pytsk3
-
-from dfdewey.datastore.postgresql import PostgresqlDataStore
-
-
-def initialise_block_db(image_path, image_hash, case):
-  """Creates a new image database.
-
-  Args:
-    image_path: Path to image file
-    image_hash: MD5 of the image
-    case: Case ID
-
-  Returns:
-    Boolean value to indicate whether the image has already been processed
-  """
-  img = pytsk3.Img_Info(image_path)
-
-  block_db = PostgresqlDataStore(autocommit=True)
-  image_exists = check_tracking_database(block_db, image_path, image_hash, case)
-
-  if not image_exists:
-    db_name = ''.join(('fs', image_hash))
-    block_db.execute('CREATE DATABASE {0:s}'.format(db_name))
-
-    block_db.switch_database(db_name=db_name)
-
-    populate_block_db(img, block_db, batch_size=1500)
-
-  return image_exists
-
-
-def check_tracking_database(tracking_db, image_path, image_hash, case):
-  """Checks if an image exists in the tracking database.
-
-  Checks if an image exists in the tracking database and adds it if not.
-  If the image exists, but is not associated with the given case ID, will add
-  the association.
-
-  Args:
-    tracking_db: PostgreSQL database
-    image_path: Path to image file
-    image_hash: MD5 of the image
-    case: Case ID
-
-  Returns:
-    Boolean value to indicate the existence of the image
-  """
-  tables_exist = tracking_db.table_exists('images')
-
-  image_exists = False
-  if not tables_exist:
-    tracking_db.execute(
-        'CREATE TABLE images (image_path TEXT, image_hash TEXT PRIMARY KEY)')
-
-    tracking_db.execute(
-        """
-        CREATE TABLE image_case (
-          case_id TEXT, image_hash TEXT REFERENCES images(image_hash),
-          PRIMARY KEY (case_id, image_hash))""")
-  else:
-    image_exists = tracking_db.value_exists('images', 'image_hash', image_hash)
-
-  image_case_exists = False
-  if image_exists:
-    image_case = tracking_db.query_single_row(
-        """
-        SELECT 1 from image_case
-        WHERE image_hash = '{0:s}' AND case_id = '{1:s}'""".format(
-            image_hash, case))
-    if image_case:
-      image_case_exists = True
-
-  if not image_exists:
-    tracking_db.execute(
-        """
-        INSERT INTO images (image_path, image_hash)
-        VALUES ('{0:s}', '{1:s}')""".format(image_path, image_hash))
-  if not image_case_exists:
-    tracking_db.execute(
-        """
-        INSERT INTO image_case (case_id, image_hash)
-        VALUES ('{0:s}', '{1:s}')""".format(case, image_hash))
-
-  return image_exists
-
-
-def populate_block_db(img, block_db, batch_size=1500):
-  """Creates a new image block database.
-
-  Args:
-    img: pytsk image info object
-    block_db: PostgreSQL database
-    batch_size: Number of rows to insert at a time
-  """
-  print('Image database does not already exist. Parsing image filesystem(s)...')
-  block_db.execute(
-      'CREATE TABLE blocks (block INTEGER, inum INTEGER, part INTEGER)')
-  block_db.execute(
-      'CREATE TABLE files (inum INTEGER, filename TEXT, part INTEGER)')
-
-  has_partition_table = False
-  try:
-    volume = pytsk3.Volume_Info(img)
-    if volume:
-      print('Image has a partition table...')
-      has_partition_table = True
-    rows = []
-    for part in volume:
-      print(
-          'Parsing partition {0:d}: {1:s}'.format(
-              part.addr, part.desc.decode('utf-8')))
-      if part.flags != pytsk3.TSK_VS_PART_FLAG_ALLOC:
-        continue
-      filesystem = pytsk3.FS_Info(
-          img, offset=part.start * volume.info.block_size)
-      for inode in range(filesystem.info.first_inum,
-                         filesystem.info.last_inum + 1):
-        file = filesystem.open_meta(inode)
-        if file.info.meta.nlink > 0:
-          for attr in file:
-            for run in attr:
-              for block in range(run.len):
-                rows.append((
-                    run.addr + block,
-                    inode,
-                    part.addr,
-                ))
-                if len(rows) >= batch_size:
-                  block_db.bulk_insert('blocks (block, inum, part)', rows)
-                  rows = []
-      if rows:
-        block_db.bulk_insert('blocks (block, inum, part)', rows)
-
-      # File names
-      directory = filesystem.open_dir(path='/')
-      list_directory(block_db, directory, part=part.addr, batch_size=batch_size)
-  except IOError:
-    pass
-
-  if not has_partition_table:
-    filesystem = pytsk3.FS_Info(img)
-    rows = []
-    for inode in range(filesystem.info.first_inum,
-                       filesystem.info.last_inum + 1):
-      try:
-        file = filesystem.open_meta(inode)
-        if file.info.meta.nlink > 0:
-          for attr in file:
-            for run in attr:
-              for block in range(run.len):
-                rows.append((
-                    run.addr + block,
-                    inode,
-                ))
-                if len(rows) >= batch_size:
-                  block_db.bulk_insert('blocks (block, inum)', rows)
-                  rows = []
-        if rows:
-          block_db.bulk_insert('blocks (block, inum)', rows)
-      except OSError:
-        continue
-
-    # File names
-    directory = filesystem.open_dir(path='/')
-    list_directory(block_db, directory, batch_size=batch_size)
-
-  block_db.execute('CREATE INDEX blocks_index ON blocks (block, part);')
-  block_db.execute('CREATE INDEX files_index ON files (inum, part);')
-
-
-def list_directory(
-    block_db, directory, part=None, stack=None, rows=None, batch_size=1500):
-  """Recursive function to create a filesystem listing.
-
-  Args:
-    block_db: PostgreSQL database
-    directory: pytsk directory object
-    part: Partition number
-    stack: Inode stack to control recursive filesystem parsing
-    rows: Array for batch database inserts
-    batch_size: Number of rows to insert at a time
-
-  Returns:
-    Current rows array for recursion
-  """
-  if not stack:
-    stack = []
-  if not rows:
-    rows = []
-  stack.append(directory.info.fs_file.meta.addr)
-
-  for directory_entry in directory:
-    # TODO(js): Refactor
-    if (not hasattr(directory_entry, 'info') or
-        not hasattr(directory_entry.info, 'name') or
-        not hasattr(directory_entry.info.name, 'name') or
-        directory_entry.info.meta is None or
-        directory_entry.info.name.name in [b'.', b'..'] or
-        directory_entry.info.name.flags == pytsk3.TSK_FS_NAME_FLAG_UNALLOC):
-      continue
-    try:
-      name = directory_entry.info.name.name.decode('utf-8')
-    except UnicodeDecodeError:
-      print('Unable to decode: {}'.format(directory_entry.info.name.name))
-      continue
-    if part:
-      rows.append((
-          directory_entry.info.meta.addr,
-          name.replace('\'', '\'\''),
-          part,
-      ))
-      if len(rows) >= batch_size:
-        block_db.bulk_insert('files (inum, filename, part)', rows)
-        rows = []
-    else:
-      rows.append((
-          directory_entry.info.meta.addr,
-          name.replace('\'', '\'\''),
-      ))
-      if len(rows) >= batch_size:
-        block_db.bulk_insert('files (inum, filename)', rows)
-        rows = []
-
-    try:
-      sub_directory = directory_entry.as_directory()
-      inode = directory_entry.info.meta.addr
-
-      if inode not in stack:
-        rows = list_directory(
-            block_db, sub_directory, part=part, stack=stack, rows=rows,
-            batch_size=batch_size)
-
-    except IOError:
-      pass
-
-  stack.pop(-1)
-  if not stack:
-    if part:
-      block_db.bulk_insert('files (inum, filename, part)', rows)
-    else:
-      block_db.bulk_insert('files (inum, filename)', rows)
-
-  return rows
-
-
-def get_filename_from_offset(image_path, image_hash, offset):
-  """Gets filename given a byte offset within an image.
-
-  Args:
-    image_path: Source image path
-    image_hash: Source image hash
-    offset: Byte offset within the image
-
-  Returns:
-    Filename allocated to the given offset
-  """
-  img = pytsk3.Img_Info(image_path)
-
-  db_name = ''.join(('fs', image_hash))
-  block_db = PostgresqlDataStore(db_name=db_name)
-
-  device_block_size = None
-  partition = None
-  partition_offset = None
-  unalloc_part = False
-  try:
-    volume = pytsk3.Volume_Info(img)
-    device_block_size = volume.info.block_size
-    sector_offset = offset / device_block_size
-    for part in volume:
-      if part.start <= sector_offset < part.start + part.len:
-        if part.flags != pytsk3.TSK_VS_PART_FLAG_ALLOC:
-          unalloc_part = True
-        partition = part.addr
-        partition_offset = part.start
-  except IOError:
-    pass
-
-  inums = None
-  if not unalloc_part:
-    try:
-      if not partition_offset:
-        filesystem = pytsk3.FS_Info(img)
-      else:
-        offset -= partition_offset * device_block_size
-        filesystem = pytsk3.FS_Info(
-            img, offset=partition_offset * device_block_size)
-    except TypeError as e:
-      print(e)
-    block_size = filesystem.info.block_size
-
-    inums = get_inums(block_db, offset / block_size, part=partition)
-
-  filenames = []
-  if inums:
-    for i in inums:
-      real_inum = i[0]
-      if i[0] == 0 and filesystem.info.ftype == pytsk3.TSK_FS_TYPE_NTFS_DETECT:
-        mft_record_size_offset = 0x40
-        if partition_offset:
-          mft_record_size_offset = \
-              mft_record_size_offset + (partition_offset * device_block_size)
-        mft_record_size = int.from_bytes(
-            img.read(mft_record_size_offset, 1), 'little', signed=True)
-        if mft_record_size < 0:
-          mft_record_size = 2**(mft_record_size * -1)
-        else:
-          mft_record_size = mft_record_size * block_size
-        real_inum = get_resident_inum(offset, filesystem, mft_record_size)
-      filename = get_filename(block_db, real_inum, part=partition)
-      if filename and not filenames:
-        filenames.append('{0:s} ({1:d})'.format(filename, real_inum))
-      else:
-        if '{0:s} ({1:d})'.format(filename, real_inum) not in filenames:
-          filenames.append('{0:s} ({1:d})'.format(filename, real_inum))
-
-  if not filenames:
-    return 'No filenames found'
-  else:
-    return ' | '.join(filenames)
-
-
-def get_inums(block_db, block, part=None):
-  """Gets inode number from block offset.
-
-  Args:
-    block_db: PostgreSQL database
-    block: Block offset within the image
-    part: Partition number
-
-  Returns:
-    Inode number(s) of the given block or None
-  """
-  if part:
-    inums = block_db.query(
-        'SELECT inum FROM blocks WHERE block = {0:d} AND part = {1:d}'.format(
-            int(block), part))
-  else:
-    inums = block_db.query(
-        'SELECT inum FROM blocks WHERE block = {0:d}'.format(int(block)))
-
-  return inums
-
-
-def get_resident_inum(offset, filesystem, mft_record_size):
-  """Gets the inode number associated with NTFS $MFT resident data.
-
-  Args:
-    offset: Data offset within volume
-    filesystem: pytsk3 FS_INFO object
-    mft_record_size: Size of an $MFT entry
-
-  Returns:
-    inode number of resident data
-  """
-  block_size = filesystem.info.block_size
-  offset_block = int(offset / block_size)
-
-  inode = filesystem.open_meta(0)
-  mft_entry = 0
-  for attr in inode:
-    for run in attr:
-      for block in range(run.len):
-        if run.addr + block == offset_block:
-          mft_entry += int(
-              (offset - (offset_block * block_size)) / mft_record_size)
-          return mft_entry
-        else:
-          mft_entry += int(block_size / mft_record_size)
-  return 0
-
-
-def get_filename(block_db, inum, part=None):
-  """Gets filename given an inode number.
-
-  Args:
-    block_db: PostgreSQL database
-    inum: Inode number of target file
-    part: Partition number
-
-  Returns:
-    Filename of given inode or None
-  """
-  if part:
-    filenames = block_db.query(
-        'SELECT filename FROM files WHERE inum = {0:d} AND part = {1:d}'.format(
-            inum, part))
-  else:
-    filenames = block_db.query(
-        'SELECT filename FROM files WHERE inum = {0:d}'.format(inum))
-
-  if filenames:
-    filename = filenames[0][0]
-  else:
-    filename = 'No filenames found'
-
-  return filename
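The deletion above removes the old processing module wholesale. Its core idea: walk each filesystem with pytsk3, record block-to-inode pairs in a blocks table and inode-to-filename pairs in a files table, then resolve "which file owns byte offset X?" with two lookups. Ignoring the NTFS resident-data special case, get_filename_from_offset() condenses to the sketch below, which reuses the module's own get_inums() and get_filename() helpers (the wrapper function itself is not from the commit):

def filename_for_offset(block_db, offset, block_size, part=None):
  # Byte offset -> block number, then block -> inode(s) -> filename(s).
  block = int(offset / block_size)
  filenames = []
  for row in get_inums(block_db, block, part=part) or []:
    inum = row[0]
    name = get_filename(block_db, inum, part=part)
    entry = '{0:s} ({1:d})'.format(name, inum)
    if entry not in filenames:
      filenames.append(entry)
  return ' | '.join(filenames) or 'No filenames found'

The logic is not gone from the project: the context lines of the @@ -228,6 +228,30 @@ hunk below show the same get_resident_inum() loop (mft_entry += ...) now living inside IndexSearcher.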
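One non-obvious detail in the deleted get_filename_from_offset() deserves a note: NTFS stores "clusters per MFT record" as a signed byte at offset 0x40 of the boot sector. A positive value counts whole clusters, while a negative value -n means the record is 2**n bytes. That is what the int.from_bytes(..., signed=True) branch implements; a standalone sketch:

def decode_mft_record_size(raw, cluster_size):
  """Decode the NTFS 'clusters per MFT record' boot sector field."""
  # raw is the single byte read from boot sector offset 0x40.
  value = int.from_bytes(raw, 'little', signed=True)
  if value < 0:
    return 2 ** (-value)        # e.g. 0xF6 -> -10 -> 2**10 = 1024 bytes
  return value * cluster_size   # positive values count whole clusters


# The common case: byte 0xF6 on a 4 KiB-cluster volume gives 1024-byte records.
assert decode_mft_record_size(b'\xf6', 4096) == 1024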
@@ -228,6 +228,30 @@ class IndexSearcher():
             mft_entry += int(block_size / mft_record_size)
     return 0
 
+  def list_search(self, query_list):
+    """Query a list of search terms.
+
+    Args:
+      query_list (str): path to a text file containing multiple search terms.
+    """
+    for image_hash, image_path in self.images.items():
+      index = ''.join(('es', image_hash))
+      with open(query_list, 'r') as search_terms:
+        table_data = []
+        for term in search_terms:
+          term = ''.join(('"', term.strip(), '"'))
+          results = self.elasticsearch.search(index, term)
+          hit_count = results['hits']['total']['value']
+          if hit_count > 0:
+            table_data.append({'Search term': term, 'Hits': hit_count})
+      if table_data:
+        output = tabulate(table_data, headers='keys', tablefmt='simple')
+      else:
+        output = 'No results.'
+      log.info(
+          'Searched %s (%s) for terms in %s\n\n%s\n', image_path, image_hash,
+          query_list, output)
+
   def search(self, query):
     """Run a single query.
 
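The new list_search() treats every line of the input file as an exact phrase: strip whitespace, wrap in double quotes, query the image's es<hash> index, and tabulate only terms with at least one hit. The quoting loop in isolation (terms.txt is an illustrative file name, not from the commit):

with open('terms.txt', 'r') as search_terms:
  for term in search_terms:
    term = ''.join(('"', term.strip(), '"'))
    # A line 'credit card' becomes the phrase query '"credit card"'.
    print(term)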
@@ -256,4 +280,5 @@ class IndexSearcher():
       hits.append(hit.copy_to_dict())
     output = tabulate(hits, headers='keys', tablefmt='simple')
     log.info(
-        'Returned %d results in %dms.\n%s', result_count, time_taken, output)
+        'Returned %d results in %dms.\n\n%s\n', result_count, time_taken,
+        output)
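The final hunk is purely cosmetic: blank lines now set the result table off from the log prefix. For reference, the tabulate() call in the context lines renders a list of dicts as an aligned plain-text table; a quick illustration with made-up rows:

from tabulate import tabulate

rows = [
    {'Search term': '"password"', 'Hits': 12},  # illustrative values
    {'Search term': '"credit card"', 'Hits': 3},
]
print(tabulate(rows, headers='keys', tablefmt='simple'))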