diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9bad294
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
\ No newline at end of file
diff --git a/data/an-openioc-graph-a-different-kind-of-rule-scheme.md b/data/an-openioc-graph-a-different-kind-of-rule-scheme.md
new file mode 100644
index 0000000..8034621
--- /dev/null
+++ b/data/an-openioc-graph-a-different-kind-of-rule-scheme.md
@@ -0,0 +1,240 @@
+Although I think that implementing a full-fledged
+XML-editor is too complex for an operational scenario, I
+believe the OpenIOC-format, which has been in the works at
+Mandiant for a couple of years now, is quite good. They also
+have the IOC Writer, which was launched at last summer's Black
+Hat. OpenIOC can export to other expression languages, such
+as Yara [1], as well.
+I have been thinking of a way to combine graph knowledge
+with exactly that for a while, an expressive detection
+language based on a graph. If combining two things you love,
+I have learned that it simply can't end badly, it must end
+with something amazing. Let's give it a try!
+So I went about it, starting off by importing a sample
+Maltego-graph to Titan on HBase [2]. I basically set out
+with five connected nodes in Maltego Tungsten. Nothing
+malicious, just a national newspaper.
+Running that through my Rexster migration script results in
+an equivalent graph on the Rexster server.
+It's nice considering if you'd like to put it in a larger
+context with millions or billions of vertices you would like
+to trigger on. That is out of bounds for Maltego, or your
+desktop system in general.
+## The OpenIOC Part
+If looking at the graphs above, you will probably agree that
+it isn't especially descriptive of certain incidents or other
+contextual data. But what if we could combine the graph with
+something like OpenIOC? Turns out that it's conceptually
+similar. The weakness of OpenIOC is that it doesn't scale
+when firing up an OpenIOC editor - like the one Mandiant
+have created. On the other hand, if you could traverse a
+graph with OpenIOC designed around the OpenIOC format..
+Let's create a basic writer as a demonstration, which
+operates on the root level (no nesting of rules in this
+ from ioc_writer import ioc_api
+ from lxml import etree as et
+ class IOC:
+ def __init__(self):
+ self.IOC = ioc_api.IOC(name='Test', description='An IOC generated from a Python script', author='Someone')
+ self.IOC.set_created_date()
+ self.IOC.set_published_date()
+ self.IOC.set_lastmodified_date()
+ self.IOC.update_name('test_rexster')
+ self.IOC.update_description('A Test')
+ self.id = self.IOC.iocid
+ def addNode(self,label,text,type,indicator,condition='is'):
+ IndicatorItem_node = ioc_api.make_IndicatorItem_node(condition, label, text, type, indicator)
+ current_guid = IndicatorItem_node.attrib['id']
+ print current_guid
+ self.IOC.top_level_indicator.append(IndicatorItem_node)
+ def __str__(self):
+ self.xml = et.tostring(self.IOC.root, encoding='utf-8', xml_declaration=True, pretty_print=True)
+ return self.xml
+This enables us to do something like this:
+ ioc = IOC()
+ ioc.addNode('test','Just a test','domain','vg.no')
+ print ioc
+Which will again return the XML of the IOC.
+ test
+ A Test
+ Someone
+ 2014-01-28T07:15:09
+ vg.no
+Reviewing the XML above you might notice that the scheme is
+pretty transferable to a graph, perhaps even simplifying
+the IOC XML. Be especially aware of the following tags and
+attributes:
+* Content
+* The IndicatorItem condition
+* The content type
+A nested IOC might look like this (relevant excerpt):
+ vg.no
+The above implies that the domain vg.no needs to be
+accompanied with the IP-address ````.
+## Merging the Best of Two Worlds
+So now that we have had a look at the power in the structure
+of a graph and the power of expression in the OpenIOC
+XML-indicators, you might see why this is the best of two
+worlds.
+In the challenge of combining them both I perhaps
+oversimplified the nesting and used the two previously
+mentioned attributes in the graph, adding the content as the
+value of the node and the condition. We will also have to
+add the type attribute since that tells us what type of
+OpenIOC entry we have when reversing the process later
+on. We will have a small collision between Maltego and
+OpenIOC, since for instance an IP-address type will
+differ. So for now you will need two type attributes, one
+for Maltego and one for OpenIOC (if you plan to go both
+ways). This is left as an exercise for the reader.
+Creating an OpenIOC-compatible graph is a breeze:
+ from rexpro import RexProConnection
+ class Graph:
+ def __init__(self):
+ self.graph = RexProConnection('localhost',8184,'titan')
+ def addVertice(self,content,content_type,condition):
+ vertice_id = self.graph.execute("""
+ def v1 = g.addVertex([content:content,content_type:content_type,condition:condition])
+ return v1""",
+ {'content':content, 'content_type':content_type, 'condition':condition})
+ return vertice_id
+ def addEdge(self,vid1,vid2,label):
+ edge = self.graph.execute("""
+ def v1 = g.v(vid1)
+ def v2 = g.v(vid2)
+ g.addEdge(v1, v2, label)
+ g.commit()""",{'vid1':vid1['_id'], 'vid2':vid2['_id'], 'label':label})
+ graph=Graph()
+ v1=graph.addVertice('vg.no','domain','is')
+ v2=graph.addVertice('','ip','is')
+ graph.addEdge(v1,v2,'and')
+If you'd like to go the other way again in order to talk to
+other organisations perhaps, you will want to run the
+process in reverse:
+ from rexpro import RexProConnection
+ class RexsterIOC:
+ def __init__(self):
+ self.graph = RexProConnection('localhost',8184,'titan')
+ self.IOC = ioc_api.IOC(name='Test', description='A test IOC generated from Rexster', author='Someone')
+ self.IOC.set_created_date()
+ self.IOC.set_published_date()
+ self.IOC.set_lastmodified_date()
+ #IOC.add_link('help', self.baseurl + url)
+ self.IOC.update_name('test')
+ self.IOC.update_description('A Test')
+ self.id = self.IOC.iocid
+ self.lastId=None
+ def addNode(self,label,text,type,indicator,condition='is',addToLast=False):
+ IndicatorItem_node = ioc_api.make_IndicatorItem_node(condition, label, text, type, indicator)
+ if addToLast and self.last:
+ self.last.append(IndicatorItem_node)
+ else:
+ self.IOC.top_level_indicator.append(IndicatorItem_node)
+ current_guid = IndicatorItem_node.attrib['id']
+ self.last = IndicatorItem_node
+ def traverse(self,rootNodeId):
+ root=self.graph.execute("""return g.v(80284)""",{'vid':str(rootNodeId)})
+ self.addNode('test','Just a test',
+ root['_properties']['content_type'],
+ root['_properties']['content'],
+ root['_properties']['condition'])
+ one_level_out=self.graph.execute("""return g.v(vid).out""",{'vid':str(rootNodeId)})
+ for vertex in one_level_out:
+ self.addNode('test','Just a test',
+ vertex['_properties']['content_type'],
+ vertex['_properties']['content'],
+ vertex['_properties']['condition'],addToLast=True)
+ def __str__(self):
+ self.xml = et.tostring(self.IOC.root, encoding='utf-8', xml_declaration=True, pretty_print=True)
+ return self.xml
+ ioc = RexsterIOC()
+ ioc.traverse(80284) # the root node
+ print ioc
+One thing that you can now do is to store the indicators
+with the rest of your network data. This again will imply
+that the edges are created automatically without any need to
+actually run jobs to combine data for detecting stuff.
+That's my small concept demonstration. I think it's pretty
+I've put the scripts in a Gist for you if you'd like to give
+it a try [3].
+[1] Yara: https://github.com/mandiant/ioc_writer/tree/master/examples/openioc_to_yara
+[2] Importing a sample Maltego-graph to Titan on HBase: https://gist.github.com/tommyskg/8166472
+[3] the scripts out there: https://gist.github.com/tommyskg/8671318
diff --git a/data/apm-lock.md b/data/apm-lock.md
new file mode 100644
index 0000000..2f651d0
--- /dev/null
+++ b/data/apm-lock.md
@@ -0,0 +1,56 @@
+I have used OpenBSD for some time now and one of the things that I
+have had to work a bit on to get the way I like it, is locking the
+terminal upon apmd suspend. In other words locking the terminals
+when I close the lid.
+Since it is a bit of code and I reuse it in other places, I
+created this as a separate helper script. Thus, my
+``/etc/apm/suspend``-reference is:
+sleep 3
+The suspend file executes every time the lid is closed.
+Once upon a time I probably used different sources for this, but
+anyways the script that I currently use is two-fold. The first
+part locks all xenodm sessions with xlock:
+# get all currently running xenodm sessions
+XSESSION=$(ps -axo user,ppid,args|awk '/xenodm\/Xsession/ { print
+# lock all logged in X sessions
+for SESSION in "$XSESSION"; do
+ _USER=$(echo $SESSION | cut -f1 -d' ')
+ _PPID=$(echo $SESSION | cut -f2 -d' ')
+ _DISPLAY=$(ps -p $_PPID -o args=|cut -d' ' -f2)
+ su - $_USER -c "export DISPLAY=\"$_DISPLAY\" && $CMD_LOCK" &
+The second part of the script kills all active consoles. This is
+the most important part for me, since I most often lock the
+screen, but forget to log off the consoles.
+# kill open console TTYs
+OPEN_TTYS=$(who|awk '{print $2}'|fgrep ttyC)
+for _TTY in $OPEN_TTYS; do
+ T=$(echo $_TTY|sed 's/tty//');
+ TTY_PID=$(ps -t $T|fgrep -v COMMAND|fgrep "ksh (ksh)"|awk '{print $1}');
+ kill -9 $TTY_PID;
+Please also be aware that suspending the laptop will leave things
+in plaintext, in memory, so to truly be resistant to an evil maid
+vector you would need to power off the laptop when out of a
+controlled area.
diff --git a/data/avenger-openbsd68.md b/data/avenger-openbsd68.md
new file mode 100644
index 0000000..82a0596
--- /dev/null
+++ b/data/avenger-openbsd68.md
@@ -0,0 +1,27 @@
+Those following me on the Fediverse have recently become familiar
+with an old-school program called Mail Avenger.
+mkdir ~/.avenger
+openssl rand -base64 8 | shasum | head -c16 > ~/.avenger/.macpass
+echo "" >> ~/.avenger/.macpass
+brew install berkeley-db4
+curl -O http://www.mailavenger.org/dist/avenger-0.8.5.tar.gz
+echo "b0fc3e2e03ed010e95e561367fce7b087968df7ea6056251eba95cad14d26d37 avenger-0.8.5.tar.gz" | shasum -a 256 --check
+tar xvzf avenger-0.8.5.tar.gz
+cd avenger-0.8.5
+./configure --with-db=/usr/local/Cellar/berkeley-db@4/4.8.30
+cd util
+make macutil && install macutil ~/.local/bin/
+macutil --expire=+2M --from "Tommy S" --fromexp "address expires" --sender "t+return+*@252.no"
diff --git a/data/cognitive-automation.md b/data/cognitive-automation.md
new file mode 100644
index 0000000..0c8b70a
--- /dev/null
+++ b/data/cognitive-automation.md
@@ -0,0 +1,105 @@
+There is a lot of hype around many things in cyber
+security. One concept that is not, is called Cognitive
+Automation (CA). CA can be explained by comparing it to
+traditional automation. That is, how tasks are automated:
+like alerts correlation. By using cognitive automation, the
+way the mind works is taken into account. I believe many
+security professionals will recognise the practical aspects
+of Schulte's model for "Complexity of automation vs
+effectiveness/safety" [1].
+I've written a post on this topic years ago ("The Role of
+Cognitive Automation in Information Security"), but
+unluckily that was lost in migration. It probably needed an
+update anyways, and I believe the cyber security field is
+more mature to receive this input now rather than at that
+Cognitive automation is strongly applied in the aerospace
+industry for instance. In aerospace, long ago, there was a
+realisation that the strengths of the human being are the
+ability to learn, instinct, problem reduction, ability of
+abstraction and several others. The machine’s strength is
+parallel processing, objectivity, long-term monitoring,
+complex planning and decision making and so on. Schulte
+describes this concept in detail, in Man-Machine Cooperation
+model [1].
+In order to benefit from a similar model in cyber security
+there is a need to evolve the way data is extracted,
+preprocessed and prepared for human-machine interaction. As
+may be recognised at this point, there is already technology
+available to provide parallel processing on the machine
+part. How a computing cluster would solve such a problem is
+the evident problem. In that regard, machine learning is the
+most promising technique to structure and classify the data
+which seems to scale really well. Efficiently ingesting,
+storing and preprocessing the data is the first stage of
+that challenge.
+Another detail that I would like to point out here, from the
+great book "The Multitasking Mind" by Salvucci and Taatgen,
+is how the human mind works with buffers (the aural, visual,
+declarative, goal, manual and problem buffers). A human can
+actually only handle one thing at once. So when analysts are
+tasked with several simultaneous tasks or roles, this will
+definitely produce bad quality results. This is really
+important for all cyber security seniors and designers to
+understand, so read the book.
+Back to how this applies in practical terms: when analysts
+manually analyse and decide by expert knowledge, classifying
+the attributes of full content data and e.g. create Yara
+and Snort signatures, it is a reasonable assumption that a
+number of relevant attributes are never evaluated as
+potential anomalies. This greatly increases the
+possibilities of the threat groups. In aerospace cognitive
+automation there is a concept called Mission Management,
+that is similar to the problem described here.
+Now for a practical example of how cognitive automation can
+work, this time paralleled with the approach taken by
+Netflix to movie recommenders. Let's say that you have
+stored the PDFiD [2] vector of all PDF documents over the
+last ten years, passing through a network. The vector
+structure will look like:
+1. 7,7,1,1,1,1,1,1,0,1,1,0,1,0
+If 500 PDF files pass through the systems each day on
+average, that will be 1,825,000 documents over those ten
+years. In addition qtime is a significant part of that
+vector - and other parameters could be file names and so on.
+Say an analyst receives a suspicious PDF file. That file may
+initially be hard for the analyst to classify. In such a case
+the system should propose other related files to look
+at. Practically speaking this saves the analyst cognitive
+capacity to use instinct, pattern recognition and creativity
+to classify the document. The machine on the other hand
+maintains objectivity, has great stress resistance, can
+retrieve a lot more information, and it can process and
+pivot on all those 10 years of documents as opposed to the
+Now that you have gotten an introduction to the world of
+cognitive automation, I hope this will drive a discussion on
+how we can take our field to the next level. I am confident
+that this means understanding and solving problems before
+attempting to buy our way out of them.
+[1] Schulte, D. A. 2002. Mission management and crew assistance for military aircraft: cognitive concepts and prototype evaluation.
+[2] PDFiD: https://blog.didierstevens.com/2009/03/31/pdfid/
diff --git a/data/converting-pst.md b/data/converting-pst.md
new file mode 100644
index 0000000..5f109d8
--- /dev/null
+++ b/data/converting-pst.md
@@ -0,0 +1,100 @@
+Some time ago I gave an introduction to converting Microsoft
+MSG files [1] to a readable RFC 2822 [2] format on Linux. In
+fact you will sometimes get an even kinkier format to work
+with: The Outlook Data File (PST) [3]. PST is a
+proprietary format used by Microsoft Outlook, and is the
+equivalent of the mbox on Linux.
+**Edit August 29th**: Also have a look at the more
+up-to-date [4].
+Even though PST files are a bit harder to read than single
+EML files, there is hope if you only have a Linux client:
+libpst, and more specifically readpst. For libpst you need
+three libraries:
+* ``libgsf`` (i/o library that can read and write common file
+types and handle structured formats that provide
+file-system-in-a-file semantics)
+* boost (portable C++ source libraries)
+* libpst
+On OS X you can install it by:
+brew install libgsf
+brew install boost
+brew install libpst
+Now if you have a pst archive, like [5] for instance, you can
+convert it by:
+ mkdir export
+ readpst -M -b -e -o export "Personal Folders.pst"
+This should give an output like this:
+ Opening PST file and indexes...
+ Processing Folder "Deleted Items"
+ Processing Folder "Inbox"
+ Processing Folder "latest"
+ [...]
+ Processing Folder "Reports"
+ "Reports" - 11 items done, 1 items skipped.
+ Processing Folder "Quotes"
+ "Quotes" - 1 items done, 1 items skipped.
+ Processing Folder "Printer"
+ "Printer" - 1 items done, 1 items skipped.
+ Processing Folder "Passwords"
+ "Passwords" - 6 items done, 1 items skipped.
+ [...]
+ Processing Folder "Kum Team"
+ "Kum Team" - 37 items done, 0 items skipped.
+ "9NT1425(India 11.0)" - 228 items done, 1 items skipped.
+ Processing Folder "Jimmi"
+ "Jimmi" - 31 items done, 0 items skipped.
+ "Inbox" - 27 items done, 11 items skipped.
+ Processing Folder "Outbox"
+ Processing Folder "Sent Items"
+ "Sent Items" - 0 items done, 1 items skipped.
+ Processing Folder "Calendar"
+ "Calendar" - 0 items done, 6 items skipped.
+ Processing Folder "Contacts"
+ "Contacts" - 0 items done, 1 items skipped.
+ [...]
+ Processing Folder "Drafts"
+ Processing Folder "RSS Feeds"
+ Processing Folder "Junk E-mail"
+ Processing Folder "quarantine"
+ "My Personal Folder" - 13 items done, 0 items skipped.
+Which creates a directory structure like ``ls -l 'export/My
+Personal Folder'``:
+ drwxr-xr-x 2 - staff 68 Aug 28 21:34 Calendar
+ drwxr-xr-x 2 - staff 68 Aug 28 21:34 Contacts
+ drwxr-xr-x 29 - staff 986 Aug 28 21:34 Inbox
+ drwxr-xr-x 2 - staff 68 Aug 28 21:34 Journal
+ drwxr-xr-x 2 - staff 68 Aug 28 21:34 Sent Items
+ drwxr-xr-x 2 - staff 68 Aug 28 21:34 Tasks
+If you sample ``Inbox/Mails/``, you will find:
+ 1.eml 10.eml 11.eml 12.eml 13.eml 14.eml 15.eml 16.eml 17.eml 2.eml 3.eml 4.eml 5.eml 6.eml 7.eml 8.eml 9.eml
+You can now continue with our previous post [6]. I'll also
+encourage you to have a look at the documentation of the
+Outlook PST format [7].
+[1] Converting Microsoft MSG files: /2013-10-08-msg-eml.html
+[2] RFC 2822: http://tools.ietf.org/html/rfc2822
+[3] The Outlook Data File (PST): http://office.microsoft.com/en-001/outlook-help/introduction-to-outlook-data-files-pst-and-ost-HA010354876.aspx
+[4] libpff: /converting-pst-archives-in-os-xlinux-with-libpff
+[5] Example PST file: http://sourceforge.net/projects/pstfileup/files/Personal%20Folders.pst/download
+[6] Reading MSG and EML Files on OSX/Linux Command Line: :4443/forensics/reading-msg-files-in-linux-command-line/
+[7] The outlook.pst format: http://www.five-ten-sg.com/libpst/rn01re05.html
diff --git a/data/gpg-openssl.md b/data/gpg-openssl.md
new file mode 100644
index 0000000..0966397
--- /dev/null
+++ b/data/gpg-openssl.md
@@ -0,0 +1,126 @@
+## Key Takeaways
+* PGP is replaceable with native OpenSSL RSA public key crypto
+ and AES-256 keys.
+* This approach simplifies crypto operations, and only requires
+ OpenSSL which is widely available.
+* Existing PGP keys stored in GnuPG work with OpenSSL via `gpgsm`.
+## Introduction
+The rabbit hole mission of mine to get rid of PGP continues.
+Lately I have been looking into converting PGP keys from GnuPG to
+OpenSSL. This way I can send encrypted data to people not using my
+OpenSSL-only approach. After all, most people still depend on PGP
+and it is the format they publish their public keys in.
+## Exporting A PGP Public Key for Encryption Using OpenSSL
+A PGP key cannot be directly read by OpenSSL, but GPG can natively
+export to SSH and ssh-keygen to PKCS8:
+gpg --export-ssh-key ! > /tmp/test.pub
+ssh-keygen -f /tmp/test.pub -e -m PKCS8 > /tmp/test.pem
+The above pubkey can be used to encrypt data with OpenSSL as shown
+on my [contact page](https://contact.252.no):
+KEY=`openssl rand -hex 32` IV=`openssl rand -hex 16`
+ENCRYPTED_KEY_B64=`openssl pkeyutl -encrypt -pubin -inkey /tmp/test.pem -pkeyopt rsa_padding_mode:oaep <<< $KEY|base64`
+BLOB=`openssl enc -aes-256-cfb -a -e -K ${KEY} -iv ${IV} -in some-file`
+echo "PKCS11-VAULT;aes-256-cfb;rsa_padding_mode:oaep;$ENCRYPTED_KEY_B64:$IV:$BLOB;" > encrypted.txt
+The steps of the above are:
+1. Create an initialization vector [1] and an encryption key
+2. Encrypt the one-time key to test.pem (our exported PGP-key)
+3. Encrypt `some-file` using the key and IV using 256 bits AES in CFB-mode
+4. Format the output in my PV-format.
+Store `encrypted.txt` for decryption in the next section.
+## Exporting a PGP Private Key for Decryption Using OpenSSL
+This part is a bit more complex. For the sake of an example, let
+us say you received an encrypted blob with an IV and encrypted
+key, using the approach shown in the former section. You have the
+key stored in GnuPG.
+`gpgsm` can export your private key to p12, which is readable for
+OpenSSL [2].
+First list your secret keys in the GnuPG store: `gpg
+--list-secret-keys --with-keygrip`.
+Convert the key to X.509 by: `gpgsm --gen-key -o
+/tmp/temp.crt`. You need to fill the values requested:
+* Select "existing key"
+* Fill the keygrip from the GPG secret key listing. Make sure you
+ use the right key, since GPG generates several keys behind the
+ scenes (the encryption key)
+* Fill the cn (this needs to be on the format "cn=...") and e-mail
+* Accept the other values as empty and accept the creation
+Now import the certificate into `gpgsm`: `gpgsm --import
+/tmp/temp.crt`. When imported, find the key ID by: `gpgsm
+Using the key ID, you can now export the key in p12-format.
+gpgsm -o /tmp/$keyid.p12 --export-secret-key-p12 $keyid
+openssl pkcs12 -in /tmp/$key.p12 -nodes -nocerts|tail -n +5 > /tmp/$key.key
+You only need to do the conversion once and now have your key in
+`/tmp/$key.key`. This should be secured accordingly, and have a
+password set as is offered in the guidance by gpgsm.
+The resulting `/tmp/$key.key` is usable for decrypting content
+encrypted by the public key. To decrypt the data in `encrypted.txt`:
+for BLOB in ${ENCRYPTION_BLOBS[@]}; do
+ ENCRYPTED_KEY=`printf $ENCRYPTED_KEY_B64 | base64 -d`
+ decrypted=false
+ DECRYPTED_KEY=`echo $ENCRYPTED_KEY_B64 |base64 -d | openssl pkeyutl -decrypt -inkey /tmp/$key.key -pkeyopt ${PADDING_MODE} 2> /dev/null` && decrypted=true
+ if [ $decrypted != false ]; then
+ TEXTFILE_DEC=`printf %s "$TEXTFILE_ENC"|base64 -d|openssl enc -$ALGORITHM -d -K "$DECRYPTED_KEY" -iv "$IV" |base64`
+ break
+ fi
+The above format supports encryption to multiple parties. It:
+1. Reads the PV-format into variables
+2. Loops through the encryption blobs (one pass if one recipient)
+3. Decrypts the key with the private key generated from `gpgsm`
+4. Using the IV and decrypted key, decrypts the content, which is
+ eventually the same as in the previous section's `some-file`
+5. Prints the decrypted content
+## Conclusion
+It is possible to convert PGP keys to use with OpenSSL via `gpgsm`.
+Since OpenSSL is more widely distributed and installed than GnuPG,
+it is a method applicable in more environments.
+Using OpenSSL instead of GnuPG provides more options, and reduces
+the complexity of cryptography (since GnuPG has lots of options).
+[1] https://stackoverflow.com/questions/39412760/what-is-an-openssl-iv-and-why-do-i-need-a-key-and-an-iv
+[2] https://superuser.com/a/1414277
diff --git a/data/graph-experiment.md b/data/graph-experiment.md
new file mode 100644
index 0000000..af64f5f
--- /dev/null
+++ b/data/graph-experiment.md
@@ -0,0 +1,103 @@
+I currently maintain this threat database, and up until now I've
+generated the graph data for d3 using queries, and a lot of logic,
+in a MySQL-database. That is going to change pretty soon. You
+might also remember when we did Social Network Analysis and Object
+Attribution with Maltego 3 [1].
+In my quest to understand the Apache Hadoop ecosystem I all
+of a sudden got a brutal meeting with Java (Eclipse huh..). I also
+discovered that there is a world of libraries and applications
+previously unknown to me. One of them is the über-awesome Neo4j,
+which is a graph database originally built for Java - but guess
+what: It's got a REST API as well. As usual you don't have to
+write the Python code yourself, someone already wrote it for
+you. Note that it only does Python 2 for now [2,3].
+The coolest thing about Neo4j is Cypher [4]: Cypher is a "graph
+query language" as they put it themselves. With Cypher you can
+express what you look for in an entirely other way than you would
+do in a relational database, it's actually easy.
+And: You of course need the database running as well. If you use a
+Debian system like me you're in luck since they have an experimental
+version out there [5].
+Enough talk, here is a simple example of how you could go about it
+in regard to scripting the relations considering threat
+intelligence in order to connect groups to incidents. The goal
+would be to find peripherally connected groups.
+ from GraphConn.Connect import Graph
+ g = Graph()
+ # create groups
+ g.cGroup("ThreatA")
+ g.cGroup("ThreatB")
+ g.cGroup("ThreatC")
+ # create incidents
+ g.cIncident("IncA")
+ g.cIncident("IncB")
+ g.cIncident("IncC")
+ # relate groups in some way to each other through incidents
+ g.link("ThreatA","IncA")
+ g.link("ThreatA","IncB")
+ g.link("ThreatB","IncC")
+ g.link("ThreatC","IncA")
+ g.link("ThreatB","IncB")
+ # find all threats related to Threat A through incidents
+ print g.fRelated("ThreatA")
+You might find this simple, but if you've ever tried to do it in
+SQL you know why you'll need it. Also, remember that this scales
+indefinitely to other entity types as well.
+Here's the class used to generate the graph, for reference (feel
+free to copy it, produce something cool and post it back in the
+comment field):
+ from neo4jrestclient import client
+ from neo4jrestclient.client import GraphDatabase
+ from neo4jrestclient.query import Q
+ class Graph:
+ def __init__(self):
+ self.gdb = GraphDatabase("http://localhost:7474/db/data/")
+ self.nodes = []
+ def cGroup(self,name):
+ n = self.gdb.nodes.create(name=name, type='Group')
+ self.nodes.append(n)
+ def cIncident(self,name):
+ n = self.gdb.nodes.create(name=name, type='Incident')
+ self.nodes.append(n)
+ def link(self,n1,n2):
+ try:
+ l = (Q("name", iexact=n1)); n1 = self.gdb.nodes.filter(l)[0];
+ l = (Q("name", iexact=n2)); n2 = self.gdb.nodes.filter(l)[0];
+ return n1.relationships.create("Executed", n2)
+ except:
+ return False
+ def fRelated(self,query):
+ l = (Q("name", iexact=query))
+ n = self.gdb.nodes.filter(l)[0]
+ r = n.traverse()
+ for n2 in r:
+ for e in n2.traverse():
+ r.append(e)
+ return list(r)
+I really hope you enjoy this as much as me right now. The Facebook
+Graph Search for the rest of us.
+[1] gopher://secdiary.com/0/post/sna-oa-maltego/index.txt
+[2] https://pypi.python.org/pypi/neo4jrestclient/
+[3] https://neo4j-rest-client.readthedocs.org/en/latest/elements.html
+[4] http://www.neo4j.org/learn/cypher
+[5] http://debian.neo4j.org/
diff --git a/data/graphs-scale.md b/data/graphs-scale.md
new file mode 100644
index 0000000..2dbc035
--- /dev/null
+++ b/data/graphs-scale.md
@@ -0,0 +1,82 @@
+Following up on my post yesterday, I have also been looking at
+graphs the other way - from a scalable database to a manageable
+graph involving e.g. just one segment.
+There are currently two ways to do this:
+1) exporting the graph, and 2) streaming the graph from and to the
+graph database. The first option is obviously the simple one, but
+doesn't always meet our needs. The latter option is often
+needed when multiple analysts work on the same graph.
+## Option 1: Exporting the Graph
+To achieve the first you can use the GraphML save function of
+ conf = new BaseConfiguration();
+ conf.setProperty("storage.backend","hbase");
+ conf.setProperty("storage.hostname","sandbox.hortonworks.com");
+ conf.setProperty("storage.port","2181");
+ g = TitanFactory.open(conf);
+ g.saveGraphML('test.graphml')
+This graph can again be opened in tools such as Gephi.
+You can also use the Gephi database API plugin for
+Rexster. There's a Blueprints repo [1] which extends that. Short
+how-to on how to get going with the Gephi development environment,
+from the wiki-pages of the plugin [2]:
+1. Get plugins from [3], and [4]
+2. Open Gephi, go to ``Tools > Plugins > Downloaded > "Add
+ Plugins..."``
+3. Press install and follow the guidance, at the end you should
+ restart Gephi
+4. Go to File > Import Database
+5. Add the Rexster configuration to ``/etc/graph/rexster.xml`` (if
+ issues arise when importing the database, look at [5])
+``rexster.xml`` should look like this:
+ RexterGraph
+ com.tinkerpop.rexster.config.RexsterGraphGraphConfiguration
+ 100
+You should be left with something like this for instance in Gephi:
+![A Rexster Graph Import to Gephi, from a Titan database. The graph consists of a variety of segments, such as articles from an article-system and imported Maltego graphs](/static/img/data/rexster-import-gephi.png)
+A Rexster Graph Import to Gephi, from a Titan database. The graph
+consists of a variety of segments, such as articles, imported
+Maltego graphs and such.
+A Rexster Graph Import to Gephi, from a Titan database. The graph
+consists of a variety of segments, such as articles from an
+article-system and imported Maltego graphs.
+Here's the cluster on the right there by the way. There's some
+interesting patterns inside there it seems, so I suspect it's from
+a Maltego graph:
+## Option 2: The Gephi Streaming API
+For the other option I found the Gephi graph streaming API
+[6]. This one I currently found a little limited in that it can
+only provide collaboration between two Gephi instances using a
+Jetty web-server. It's pretty cool, but doesn't offer the
+integration I am looking for. I'll get back to this later.
+[1] https://github.com/datablend/gephi-blueprints-plugin
+[2] https://github.com/datablend/gephi-blueprints-plugin/wiki
+[3] https://github.com/downloads/datablend/gephi-blueprints-plugin/org-gephi-lib-blueprints.nbm
+[5] https://github.com/datablend/gephi-blueprints-plugin/issues/1
+[6] https://marketplace.gephi.org/plugin/graph-streaming/
diff --git a/data/indicators.md b/data/indicators.md
new file mode 100644
index 0000000..de851a8
--- /dev/null
+++ b/data/indicators.md
@@ -0,0 +1,463 @@
+Over what have become some years, cyber security
+professionals have been working on optimising the sharing of
+information and knowledge. A lot of the efforts have
+recently been focused around intelligence- and data-driven
+teams. Today many of these discussions have ended up revolving
+around something related to the STIX format.
+> Don't use a lot where a little will do
+> – Unknown origin
+This post features a perspective of the potential of today's
+standard-oriented approach for documenting indicator sets
+related to cyber security threat actors and incidents. It
+turns out we have a longer way to go than expected.
+For the purpose of this article, an indicator is a
+characteristic or evidence of something unwanted, or hostile
+if you'd like. I like to refer to the military term
+"Indicators & Warnings" in this regard. In other words, an
+indicator isn't necessarily limited to the cyber domain
+alone either. Physical security could be in an even worse
+condition than cyber security when it comes to expressing
+threat indicators. I'll leave the cross-domain discussion
+for another time.
+## Up Until Today
+Multiple standards have evolved and disappeared, and one
+that I have been in favor of previously is the OpenIOC 1.1
+standard. However, times are changing, and so are the
+terminology and breadth of how we are able to express the
+intrusion sets.
+Even though OpenIOC was a very good start, and still is as
+far as I am concerned, it has been far surpassed by Cybox and
+ultimately STIX [1] in popularity.
+STIX is a container, a quite verbose XML format (which is
+turning JSON in 2.0). Cybox is the artefact format [2], for
+malware you have MAEC [3] and so on. Basically it's a set of
+projects collaborating.
+This all sounds good, right? Not quite. Have a look at the
+OpenIOC to STIX repository on Github [4] and you will find
+that ``stuxnet.stix.xml`` is 202 lines of XML code for 18
+atomic indicators. OpenIOC on the other hand, is 91 lines,
+and that is a verbose format as well. In fact the overhead
+ratio of the STIX file is about 10:1, while OpenIOC is about
+To add to the mind-blowing inefficiency I have yet to see,
+on a regular basis, complex and nested expressions of an
+actor or a campaign in the STIX format.
+Before you continue, do a simple Google search for "STIX
+editor" and "cybox editor". Do it now, and while you are at
+it google for "openioc editor" as well. Hello guys, these
+standards have been going around for many years. So, how
+should we interpret that there aren't any user friendly
+approaches to using them? The closest I've come is through
+MISP, and that is generally speaking not using these
+standards for their internal workings either. This one on
+the MISP GitHub issue tracker says it all: STIX 2.x support
+(MISP) [5].
+I'm sure that some may disagree with the above statements,
+calling out the infancy of these formats. However, they
+can't be said to be new standards anymore. They are just too
+complex. One example of such is the graph-oriented relations
+implemented into the formats. Why not just let a graph
+database take care of these instead?
+This is not just a post to establish the current state. How
+would a better approach look?
+## What Is The Problem to Be Solved?
+Back to where things have gone since the OpenIOC 1.1/atomic
+indicator days. The most promising addition, in my opinion,
+is the MITRE PRE-ATT&CK and ATT&CK frameworks. The two
+frameworks build on a less structured approach than seen
+for atomic indicators (Lockheed's Kill-Chain). The latter
+can for instance be viewed in form of the Intelligence
+The Intelligence Pyramid's abstraction levels can be mapped
+against what it is supposed to support when it comes to
+indicators like the following:
+ | Level of abstraction | | Supports
+ |-----------------------|----|-------------
+ | Behavior | | Knowledge
+ |-----------------------|--->|-------------
+ | Derived | | Information
+ |-----------------------|--->|-------------
+ | Atomic | | Data
+The purpose of the abstraction layer is in this case to
+support assessments and measures at the corresponding
+contextual level. For instance a technical report tailored
+to an Incident Response Team (IRT) generally concerns
+Derived and Atomic indicators, while an intelligence report
+would usually be based on the Behavioural level.
+Having covered the abstraction layers, we can recognize that
+OpenIOC (or Cybox and MAEC) covers the bottom layers of
+abstraction, while MITRE (PRE-)ATT&CK in its current form is
+mostly about the Behaviour level.
+For Derived indicators there are primarily two
+well-established, seasoned and successful formats that have
+become standards through their widespread usage. This is
+amongst others caused by the indicators and rules being
+effective, rapid, easy and pleasing to write.
+First we have Snort/Suricata rules and Lua scripts which were
+designed for network detection. For Snort/Suricata I'd say
+that most of what is detected of metadata today is probably
+expressible in OpenIOC (except for the magic that can be
+done with Lua). Second there is the Yara format which has
+become known for its applicability against malicious
+files. The simplicity of both formats is obviously due to
+their power of expression. Thus, I'd say that Yara and
+Snort/Suricata formats are the ones to look for when it comes
+to content and pattern detection.
+> Indicators should be easy and pleasing to write.
+To summarize the above, each of the formats can be mapped to
+an abstraction level:
+ | Level of abstraction | | Formats
+ |-----------------------|----|-------------
+ | Behavior | | MITRE (PRE-)ATT&CK
+ |-----------------------|--->|-------------
+ | Derived | | Suricata+Lua, Yara
+ |-----------------------|--->|-------------
+ | Atomic | | OpenIOC 1.1
+Going through my notes on how I document my own indicators I
+also found that I use the CVE database, datetimes,
+confidence, analyst comments for context and classification
+as well (the latter being irrelevant for detection).
+One of the major problems is: everything that is currently
+out there breaks the analyst workflow. You either need to
+log in to some fancy web interface, edit XML files (god
+forbid) or you would just jot down everything in a text
+file. The text file seems to be the natural fallback in
+almost any instance. I have even attempted to use the very
+good initiative by Yahoo, PyIOCe, and Mandiant's
+long-forgotten IOC Editor. These projects have both lost
+traction, as almost every other initiative in this space. So
+that is right folks, the text editor is still the preferred
+tool in 2018, and let's face it: indicators should be
+pleasing to design and create - like putting your signature
+to an incident or a job well done.
+> an indicator set should be for humans and machines by
+ humans
+After all, the human is the one that is going to have to
+deal with the indicator sets at some point, and we are the
+slowest link. So let us not slow ourselves down more than
+necessary. At this point I would like to propose the golden
+rule of creating golden rules: an indicator set should be
+for humans and machines by humans.
+You may also have noticed that when all these standards
+suddenly are combined into one standard, they become less
+user-friendly. In other words, let us rather find back to
+our common \*NIX roots where each tool had a limited set of
+Graphs are essential when writing indicators. Almost
+everything in the world around us can be modelled as a
+network, and infiltration and persistence in cyberspace is
+no exception. Thus, an indicator format needs to be
+representable in a graph, and guess what? Almost everything
+is, as long as it maintains some kind of structure.
+For graphs there are two ways of going about the problem:
+1) Implement the graph in the format
+2) Make sure that you have a good graph backend and an
+automatable and traversable format available
+For option 1, the graph in the format will increase the
+complexity significantly. Option 2 results in the opposite,
+but that does not mean that it can't be converted to a
+graph. To make an elaborate discussion short, this is what
+we have graph databases for, such as Janusgraph [6].
+## A Conceptual View
+Summarizing the above, I'd like to propose the following
+requirements for indicator formats:
+1) Indicator sets should be easy and inviting to create
+2) You should be able to start writing at any time, when you
+need it
+3) Unnecessary complexity should be avoided
+4) The format should be human readable and editable
+5) A machine should be able to interpret the format
+6) Indicator sets should be graph compatible
+With a basis in this article, I believe that the best
+approach is to provide a basic plain text format
+specification that inherits from the OpenIOC 1.1 and MITRE
+frameworks and references other formats where necessary.
+Let us imagine that we found an IP address in one
+situation. The IP-address was connected to a domain that we
+found using passive DNS. Further, it was found that a
+specific file was associated with that domain through a
+Twitter comment. Representing the given information in its
+purest (readable) form looks like the following:
+ // a test file
+ class tlp:white
+ date 2018/02/18
+ ipv4 low
+ domain med secdiary.com
+ technique PRE-T1146
+ filename med some_filename.docx
+ comment found in open sources
+To recap some of the previous points: the above format is
+simple, it can be written at any time based on knowledge of
+well known standards. The best of it all is that if you are
+heavily invested in specific formats, it can be converted to
+them all using a simple interpreter traversing the format.
+Further, such a format is easily converted into a tree and
+can be loaded into a graph for traversing and automated
+assessments. Each confidence value can be quantified
+(``low=0.33``, ``med=0.66``, ``high=1.0``). That said,
+simplicity in this case equals actionable indicators.
+ | v: (0.33) | match |
+ | e | |
+ | v: secdiary.com (0.66) | no match | (0.33+0.66)/2=0.5
+ | e | |
+ | v: some_filename.docx (0.66) | match |
+For networks vs hierarchies: a drawback of the latter, as
+mentioned in the former section, is the lack of
+e.g. multiple domains being connected to different other
+vertices. A practical solution goes as follows:
+ ipv4 low
+ domain med secdiary.com
+ domain low secdiary.com
+ ipv4 low
+The graph receiving the above indicator file should identify
+the domain as being a unique entity and link the two IP
+addresses to the same domain:
+ | v: (0.33)
+ | e: 0.5
+ | v: secdiary.com (0.5)
+ | e: 0.33
+ | v: (0.33)
+As for structuring the indicator format for machines in the
+practical aspect, consider the following pseudocode:
+ indicators = [(0,'ipv4','low',''),...]
+ _tree = tree(root_node)
+ for indicator in indicators
+ depth = indicator[0]
+ _tree.insert(indicator,depth)
+Now that we have the tree represented in code, it is
+trivially traversable when loading it into some graph:
+ method load_indicators(node,depth):
+ graph.insert(node.parent,edge_label,node)
+ for child in node.children
+ load_indicators(child,depth+1)
+ load_indicators(tree,0)
+## Summary
+Hopefully I did not kill too many kittens with this
+post. You may or may not agree, but I do believe that most
+analysts share at least parts of my purist views on the
+We are currently too focused on supporting standards and
+having everyone use as few of them as possible. I believe
+that energy is better used on getting more consistent in the
+way we document and actually exchange more developed
+indicator sets than the md5 hash- and domainlists that are
+typically shared today ("not looking at these kinds of files
+at all" - even though it's not the worst I've seen:
+``MAR-10135536-F_WHITE_stix.xml`` [7]).
+In the conceptual part of this article I propose a simple
+but yet effective way of representing indicators in a
+practical manner. Frankly, it is even too simple to be
+novel. It is just consistent and intuitive.
+PS! For the STIX example above, have a look at the following
+to get a feel for the actual content of the file (used one
+of the mentioned specimens to show the point):
+ class tlp:white
+ date 2018/02/05
+ sha1 high 4efb9c09d7bffb2f64fc6fe2519ea85378756195
+ comment NCCIC:Observable-724f9bfe-1392-456e-8d9b-c143af15f8d4
+ comment did not convert all attributes
+ compiler Microsoft Visual C++ 6.0
+ md5 high 3dae0dc356c2b217a452b477c4b1db06
+ date 2016-01-29T09:21:46Z
+ entropy med 6.65226708818
+ #sections low 5
+ intname med ProxyDll.dll
+ detection med symantec:Heur.AdvML.B
+The original document states those same indicators in no fewer than 119 lines
+with an overhead ratio of about 1:5 (it looks completely insane):
+ 3DAE0DC356C2B217A452B477C4B1DB06
+ 336073
+ PE32 executable (DLL) (console) Intel 80386, for MS Windows
+ MD5
+ 3dae0dc356c2b217a452b477c4b1db06
+ SHA1
+ 4efb9c09d7bffb2f64fc6fe2519ea85378756195
+ SHA256
+ 8acfe8ba294ebb81402f37aa094cca8f914792b9171bc62e758a3bbefafb6e02
+ SHA512
+ e52b8878bd8c3bdd28d696470cba8a18dcc5a6d234169e26a2fbd9862b10ec1d40196fac981bc3c5a67e661cd60c10036321388e5e5c1f60a7e9937dd71fadb1
+ 3072:jUdidTaC07zIQt9xSx1pYxHvQY06emquSYttxlxep0xnC:jyi1XCzcbpYdvQ2e9g3kp01C
+ Microsoft Visual C++ 6.0
+ Microsoft Visual C++ 6.0 DLL (Debug)
+ 6.65226708818
+ 5
+ 2016-01-29T09:21:46Z
+ 4096
+ MD5
+ e14dca360e273ca75c52a4446cd39897
+ 0.672591739631
+ .text
+ 49152
+ 6.41338619924
+ MD5
+ 076cdf2a2c0b721f0259de10578505a1
+ .rdata
+ 8192
+ 3.293891672
+ MD5
+ 4a6af2b49d08dd42374deda5564c24ef
+ .data
+ 110592
+ 6.78785911234
+ MD5
+ c797dda9277ee1d5469683527955d77a
+ .reloc
+ 8192
+ 3.46819043887
+ MD5
+ fbefbe53b3d0ca62b2134f249d249774
+[1] STIX: https://oasis-open.github.io/cti-documentation/
+[2] Cybox example: https://github.com/CybOXProject/schemas/blob/master/samples/CybOX_IPv4Address_Instance.xml
+[3] MAEC: https://maec.mitre.org/
+[4] OpenIOC to STIX repository on Github: https://github.com/STIXProject/openioc-to-stix
+[5] STIX 2.x support (MISP): https://github.com/MISP/MISP/issues/2046
+[6] Janusgraph: http://janusgraph.org/
+[7] MAR-10135536-F_WHITE_stix.xml: https://www.us-cert.gov/sites/default/files/publications/MAR-10135536-F_WHITE_stix.xml
diff --git a/data/jnetpcap-tuning.md b/data/jnetpcap-tuning.md
new file mode 100644
index 0000000..74a0a66
--- /dev/null
+++ b/data/jnetpcap-tuning.md
@@ -0,0 +1,84 @@
+There comes a time when programming that one will have to start
+paying attention to performance. While this is true in many cases,
+there are two places where it is especially important:
+parallel processing and packet captures. Even better if doing both
+at once. In this article we'll keep the latter in mind together
+with jNetPcap, a Java wrapper for libpcap able to do 60Kpps per
+First of all I found an excellent post on performance tuning
+jNetPcap. There's also a good implementation example for moving to
+the much faster ``JBufferHandler`` [1].
+One should take note of the ring buffer, that is how much memory
+you will have to temporarily store packets if there's a lot of
+traffic. Usually this may be e.g. 453k, while the maximum can be
+4M (for instance 4078 as it was in my case). For tuning this on
+RedHat one may use ``ethtool -g eth0``, and adjust it with
+``ethtool -G eth0 rx 4078``. Larger buffers result in higher
+throughput, but also higher latency (which is not that important
+when doing packet captures). More on ethtool and ring buffer
+adjustments here.
+When it comes to jNetPcap, the following is an example
+implementing it as an Apache Flume source [2]:
+ @Override
+ public void start() {
+ final ChannelProcessor channel = getChannelProcessor();
+ JBufferHandler jpacketHandler = new JBufferHandler() {
+ public void nextPacket(PcapHeader pcapHeader, JBuffer packet, ChannelProcessor channelProcessor) {
+ int size = packet.size();
+ JBuffer buffer = packet;
+ byte[] packetBytes = buffer.getByteArray(0, size);
+ Event flumeEvent = EventBuilder.withBody(packetBytes);
+ channel.processEvent(flumeEvent);
+ }
+ };
+ super.start();
+ pcap.loop(-1, jpacketHandler, channel);
+ }
+The above shows you a slightly different version than the most
+well-documented example (``PcapHandler``) [3]. You should choose
+the above one since it is much faster due to the packet
+referencing. I did a test on one site and the performance
+increased drastically in terms of improving packet loss on the
+software-side of things.
+Last but not least, in order to do software side performance
+monitoring, you might want to add a handler to capture statistics
+in jNetPcap. This is mentioned here in the jNetPcap forums as well
+> You can also use PcapStat to see if libpcap is dropping any
+> packets. If the buffer becomes full and libpcap can't store a
+> packet, it will record it in statistics. This is different from
+> the NIC dropping packets.
+This may be implemented in the configuration as shown here:
+ PcapStat stats = new PcapStat();
+ pcap = Pcap.openLive(device.getName(), SNAPLEN, Pcap.MODE_PROMISCUOUS, timeout, errbuf);
+ pcap.stats(stats);
+You can get the stats with the following:
+ System.out.printf("drop=%d, ifDrop=%d\n",stats.getDrop(), stats.getIfDrop());
+Hope this gets you up and running smoothly, tuning packet captures
+in chain with parallel computing is a challenge.
+To get some more context you may also like to have a look at the
+presentation that Cisco did on OpenSOC, that's how to do it.
+[1] http://jnetpcap.com/node/67
+[2] http://flume.apache.org/
+[3] http://jnetpcap.com/examples/dumper
+[4] http://jnetpcap.com/node/704
diff --git a/data/mac-mini-debian.md b/data/mac-mini-debian.md
new file mode 100644
index 0000000..8e2fb1c
--- /dev/null
+++ b/data/mac-mini-debian.md
@@ -0,0 +1,173 @@
+There are a lot of guides on booting Linux on a Mac Mini, and the
+Mac Mini is absolutely great. There are also a lot of guides which
+take some unnecessary steps on the way from the native OS X
+experience to the bloated, and difficult-to-setup Linux on OS
+X. Some of them are good on certain points though.
+So, not surprising, I will tell you how to make it work with both
+a native EFI installation and the Broadcom BCM4366 up and running.
+Everything will be done on the command line, so this will work
+great on servers as well. Of course you won't run wifi on the work
+server though (!).
+First, take note that this will wipe almost everything Apple from
+your box except the Firmware. You may roll back through pressing
+the ALT-key while booting.
+Second, you should use Debian 8.0 "Jessie" (which is currently in
+RC1). This is important since Wheezy doesn't support the Broadcom
+Prerequisites for this article are:
+* A Mac Mini, tested on an OCT 2014 model
+* A keyboard
+* A USB memory stick of at least 2GB (speed is the key)
+## 1. Install Debian - and Change Boot Order
+You should create a bootable USB stick for your Debian
+installation. When you've downloaded the ISO, you can make it
+bootable without hassle through Unetbootin [1]. That one works on
+OS X 10.10 "Yosemite" as well.
+When you've got that one ready insert it into the Mini, holding
+the ALT-key while booting. You will get to the boot menu, choose
+the "EFI" one. This will initiate GRUB from the stick.
+Do the installation as you would on any other machine. Since your
+mac is still setup to boot to OS X, we need to change that next in
+order to make it point to the Debian installation instead.
+When rebooting, get into the boot menu by holding the ALT-key
+again. Select that same GRUB menu again, _BUT_ instead of choosing
+to install it you should now press "c" to get to the GRUB command
+It is now time to locate the boot directory [2] on the right
+disk. Vary X (disk) and Y (partition table) until you find the
+right combination:
+ grub> ls (hdX,gptY)/boot/grub
+That may for instance result in:
+ grub> ls (hd2,gpt2)/boot/grub
+Set the ``root`` to that disk and partition table, and boot it:
+ grub> set root=(hd2,gpt2)
+ grub> ls -l (hd2,gpt2)
+ grub> linux /boot/vmlinux[...].efi.signed root=UUID=[uuid from above command]
+ grub> initrd /boot/initrd[...]
+ grub> boot
+You will now boot to the one you just installed. It is time to
+make it persistent and change the boot order with
+``efibootmgr``. First list your current settings by:
+ sudo efibootmgr
+Now change the boot order (may vary, point being that Debian
+should come first):
+ sudo efibootmgr -o 0,1
+Now reboot and enjoy the darkness without wifi.
+## 2. Get Wifi Up and Running (Offline)
+The current Broadcom chipset is quite new, so you'll need to step
+it up to Debian "Jessie" to get it working. Cutting this a bit
+short, you will probably need this part to be offline. Showing you
+a small trick you can get all those dependencies on a vmware
+installation (run the same image as the one you installed,
+remember to simulate that you don't have network on that virtual
+ apt-get -qq --print-uris install build-essential linux-headers-$(uname -r) broadcom-sta-dkms patch bzip2 wpasupplicant | cut -d\' -f 2 > urls.txt
+This will produce a file of urls that are all the packages
+requested and their dependencies, get the stick, format it with
+FAT - and grab the packages to it:
+ wget -i urls.txt
+Unmounting that from the virtual installation, insert it into the
+physical installation:
+ cd /mnt/usb
+ dpkg -i *.deb
+Remove all modules that may conflict (and blacklist them in
+ modprobe -r b44 b43 b43legacy ssb brcmsmac
+Load the Broadcom module:
+ modprobe wl
+ echo wl >> /etc/modules
+Everything that's left now is configuring and starting
+ wpa_passphrase [passphrase] > /etc/wpa_supplicant.conf
+ wpa_supplicant -B -i wlan0 -c /etc/wpa_supplicant.conf
+To make it persistent enable the interface in
+``/etc/network/interfaces`` by appending:
+ auto wlan0
+ iface wlan0 inet dhcp
+ wpa-conf /etc/wpa_supplicant.conf
+If you have made an exception in your DHCP pool, you should also
+make it static (basic stuff, but anyways):
+ auto wlan0
+ iface wlan0 inet static
+ wpa-conf /etc/wpa_supplicant.conf
+ address
+ netmask
+ gateway
+That's basically it. Enjoy the show!
+**Edit 1, FEB 7th 2015:** So I got to play with ``systemd``, since
+it turns out a service isn't a service the way it used to be. In
+order to start services in Debian "Jessie", you'll need to use
+``systemd``. Here's an example for ``znc`` [3]:
+ [Unit]
+ Description=An advanced IRC bouncer
+ After=network.target oidentd.socket
+ [Service]
+ Type=simple
+ EnvironmentFile=/etc/conf.d/znc
+ User=znc
+ ExecStart=/usr/bin/znc -f $ZNC_OPTIONS
+ ExecReload=/bin/kill -HUP $MAINPID
+ [Install]
+ WantedBy=multi-user.target
+Also create the directory and drop the following line into
+``/etc/conf.d/znc``: ``ZNC_OPTIONS="-d /var/lib/znc"``
+**Edit 2, FEB 7th 2015:** To enable the Mac Mini to auto-restart
+after power failure set the following PCI value [4]:
+ setpci -s 0:1f.0 0xa4.b=0
+[1] http://unetbootin.sourceforge.net/
+[3] https://gist.github.com/tlercher/3897561
+[4] http://smackerelofopinion.blogspot.no/2011/09/mac-mini-rebooting-tweaks-setpci-s-01f0.html
diff --git a/data/maltego-search.md b/data/maltego-search.md
new file mode 100644
index 0000000..7b38fb7
--- /dev/null
+++ b/data/maltego-search.md
@@ -0,0 +1,52 @@
+I've previously been writing on how to read and process Maltego
+mtgx graph archives. When you start to get a directory with a lot
+of them you will probably be like me "Where did I see this thing
+The solution can of course be done in Python like in my previous
+post, but let's try a more native solution this time, zipgrep:
+> zipgrep will search files within a ZIP archive for lines
+> matching the given string or pattern. zipgrep is a shell script
+> and requires egrep(1) and unzip(1L) to function. Its output is
+> identical to that of egrep(1).
+In my testing I had 20 files, and everything worked pretty well in
+regard to searching the files by e.g. ``zipgrep \*.mtgx
+\*.graphml``. The problem here being that zipgrep doesn't seem to
+support printing the archive names, so thank you for
+that. Returning to the more basic zip tools, like zip cat was the
+solution in my case:
+ unzip -c \*.mtgx 2>&1 |egrep "(Archive: )|"
+ Archive: 1.mtgx
+ Archive: 2.mtgx
+ Archive: 3.mtgx
+ Archive: 4.mtgx
+ Archive: 5.mtgx
+ Archive: 6.mtgx
+ Archive: 7.mtgx
+ Archive: 8.mtgx
+ Archive: 9.mtgx
+ Archive: 10.mtgx
+ Archive: 11.mtgx
+ Archive: 12.mtgx
+ Archive: 13.mtgx
+ Archive: 14.mtgx
+ Archive: 15.mtgx
+ Archive: 16.mtgx
+ Archive: 17.mtgx
+ Archive: 18.mtgx
+ Archive: 19.mtgx
+ Archive: 20.mtgx
+A little Maltego archive insight helps us along speeding up the
+query, since the graphml file will always stay at
+ unzip -c \*.mtgx Graphs/Graph1.graphml 2>&1 |egrep "(Archive: )|"
+The latter results in the same results as given above.
diff --git a/data/matrix.md b/data/matrix.md
new file mode 100644
index 0000000..8ce0334
--- /dev/null
+++ b/data/matrix.md
@@ -0,0 +1,199 @@
+We have all been there during security operations. One of the
+parties involved in an incident or daily routine is not prepared
+for thinking they could be compromised.
+Communications and information sharing is one of the fundamental
+things that you need to get right during a crisis.
+As now-retired FBI director James Comey put it to 60 minutes [1]:
+> There are two kinds of big companies in the United States. There
+> are those who've been hacked by the Chinese and those who don't
+> know they've been hacked by the Chinese.
+The following question always arises: How do we maintain
+operational security while still being able to communicate with
+all parties involved?
+In practical terms this requires a communications platform to:
+* Be independent of the service infrastructure
+* Provide traceability
+* Be resistant to resourceful threat actors
+* Have simple and secure identity management
+* Have cross-platform compatibility
+* Provide file-sharing capabilities and ability to give the user
+ an opportunity to express himself
+* Support video and audio exchanges
+* Be under the control of the team using it (the smallest circle
+ of trust)
+* Provide both end-to-end and transport layer encryption
+* Disposable server infrastructure
+This could have been a bit too much to ask for a couple of years
+ago, but today there are at least two alternatives satisfying the
+above requirements: Mattermost and the Matrix ecosystem. For the
+remainder of this post I will focus on how to establish an ad-hoc
+system with the tools provided by the Matrix project.
+## Setting Up An Out-of-Band Channel for Incident Handling with Matrix
+Getting started takes three steps:
+1. Establish a back-end server on Digital Ocean
+2. Serve the Riot front-end website
+3. Establish a recording capability with Matrix Recorder [2]
+For the two first points, it is clever to use an approach that can
+be easily reproduced and that provides exactly the same,
+secure-by-default configuration each time. Due to this the
+preferred method in this case is to manage the VPS that can be
+established on anything with Debian or CentOS with Ansible. There
+is a script available on Github, known as
+matrix-docker-ansible-deploy [3]. The latter has also been
+endorsed by the Matrix project [4]. Both 1 and 2 can be
+accomplished with ``matrix-docker-ansible-deploy``.
+So let's get started.
+### Basic DNS-service
+For this example I created a domain on namesilo.com and pointed
+that to ``(ns1|ns2|ns3).digitalocean.com``. It would be unfortunate
+for the continuity of the service if a domain was taken offline or
+redirected somewhere, but due to the end to end encryption in
+Matrix it would not compromise the content of the
+conversations. Now that Digital Ocean has control of the primary
+domain, make sure to add the following before continuing:
+ Type Hostname Value TTL
+ A 600
+ A riot. 600
+ A matrix. 600
+ SRV _matrix._tcp. 10 0 8448 matrix. 600
+This can take some time to propagate, so make sure that the
+DNS-infrastructure is readily resolvable before you continue
+deploying the services.
+### Configure
+Make sure to grab a copy of the current
+``matrix-docker-ansible-deploy`` by running:
+ git clone https://github.com/spantaleev/matrix-docker-ansible-deploy.git
+Create the following files:
+ inventory/host_vars/matrix./vars.yml
+ inventory/hosts
+``vars.yml`` should look like this:
+ host_specific_matrix_ssl_support_email:
+ host_specific_hostname_identity:
+ matrix_coturn_turn_static_auth_secret: ""
+ matrix_synapse_macaroon_secret_key: ""
+The Ansible ``hosts`` file should be formatted like the following:
+ all:
+ children:
+ matrix-servers:
+ hosts:
+ matrix.:
+ ansible_user: root
+### Deploy and Execute
+Now that your configuration files and server are ready, you can
+start deploying the Matrix Synapse server and start serving the
+Riot HTML/JS client.
+First deploy the services (Riot and Matrix Synapse) by running:
+ ansible-playbook -i inventory/hosts setup.yml --tags=setup-main
+When that completes successfully, you can start the services by:
+ ansible-playbook -i inventory/hosts setup.yml --tags=start
+After starting the services, the Riot web interface is available
+on ``https://riot.`` where metadata is protected by a
+Let's Encrypt certificate.
+The two primary endpoints you now have exposed to the WWW are:
+* The Matrix API which runs at https://matrix.
+* The Riot UI which runs at https://riot.
+Going to ``https://riot.`` brings you to the Riot
+### Adding Users
+Registration is disabled by default on the server, so new users
+can be added by the following command:
+ ansible-playbook -i inventory/hosts setup.yml
+ --tags=register-user
+ --extra-vars='username=
+ password=
+ admin=(yes|no)'
+It is better to use pseudonyms on such a platform to make sure no
+information can be traced to a specific individual not involved in
+the case. Each user needs to verify his private key fingerprint
+with the other participants.
+### Vital Steps to Take as an Administrator
+When using multiple servers, it is necessary to create an
+``#control`` channel that is a fallback if a server hosting a room
+goes down.
+### Setup Matrix Recorder
+To make sure that all communications are stored for traceability,
+make sure to install the Matrix Recorder (MR). MR should be
+installed locally and _not_ on the Matrix server.
+ git clone https://gitlab.com/argit/matrix-recorder.git
+ cd matrix-recorder/
+ npm install
+To execute the recorder, run the following. The first time you
+will be asked to enter the login credentials of the user.
+ $ node matrix-recorder.js
+ Loading olm...
+ Your homeserver (give full URL): https://matrix.
+ Your username at the homeserver:
+ Your password at the homeserver:
+ No of items to retrieve for initial sync: 1000
+ [...]
+View messages as HTML by running the Matrix Recorder conversion
+ node recorder-to-html.js
+### Controlling Logins
+Access monitoring can be done in the console by e.g. ``tail -f
+### The Power of Disposability
+At some point you have finished the information exchange. The
+beauty of this setup is that it can now be safely deleted from the
+Digital Ocean droplet console.
+[1] James Comey and 60 minutes: https://www.cbsnews.com/news/fbi-director-james-comey-on-threat-of-isis-cybercrime/
+[2] Matrix Recorder: https://matrix.org/docs/projects/other/matrix-recorder.html
+[3] matrix-docker-ansible-deploy: https://github.com/spantaleev/matrix-docker-ansible-deploy
+[4] Matrix project endorsement: https://matrix.org/blog/2018/06/01/this-week-in-matrix-2018-06-01/
diff --git a/data/microsoft-dominating-email.md b/data/microsoft-dominating-email.md
new file mode 100644
index 0000000..762691b
--- /dev/null
+++ b/data/microsoft-dominating-email.md
@@ -0,0 +1,159 @@
+## Key Takeaways
+* While market dominance was formerly an issue discussed for
+ operating systems, the modern equivalent occurs in form of cloud
+ services, primarily from Microsoft, Amazon and Google.
+* Data from the Norwegian business registry mapped to email
+ records shows that Microsoft Office 365 has become a dominating
+ force amongst Norwegian private businesses and 61% of the
+ government.
+* Microsoft being a significant actor for email indicates that
+ Norwegian organisations are putting a lot more faith in
+ Microsoft. Today email as a service is bundled with direct
+ messaging and wikis.
+## Introduction
+In 2003 Dan Geer, Bruce Schneier and others wrote a paper named
+"How the Dominance of Microsoft's Products Poses a Risk to
+Security". It eventually cost Geer his job at AtStake.
+The paper revolves around Microsoft's dominance in operating
+systems and Geer has later given Microsoft credit for a better
+approach to security [2].
+In this article I am not going to reiterate the points made by
+Geer et al. I think these are perfectly valid and easily
+transferable to the current landscape. The whole paper is
+worth reading, but I'd like to highlight one part:
+> Governments, and perhaps only governments, are in leadership
+> positions to affect how infrastructures develop. By enforcing
+> diversity of platform to thereby blunt the monoculture risk,
+> governments will reap a side benefit of increased market
+> reliance on interoperability, which is the only foundation for
+> effective incremental competition and the only weapon against
+> end-user lock-in. A requirement that no operating system be more
+> than 50% of the installed based in a critical industry or in a
+> government would moot monoculture risk. Other branches to the
+> risk diversification tree can be foliated to a considerable
+> degree, but the trunk of that tree on which they hang is a total
+> prohibition of monoculture coupled to a requirement of
+> standards-based interoperability.
+Azure is Windows in 2021. The walled gardens are somewhat
+redefined - but they are there in a similar fashion as Windows was
+in 2003. The Microsoft monopoly is technically broken, and there
+are now options from Amazon, Google and even Apple, but I would
+argue the monoculture is still present in shared approaches,
+infrastructure and concepts.
+I decided to have a closer look at the distribution from a
+representative dataset provided by an authoritative source in
+Norway; the business registry.
+## Taking a Close Look at The Data
+In Norway we have a public registry of organisations. This registry is
+categorised by standardised sector codes (typically "government",
+"private" and so on). Using the JSON-data provided by brreg.no, a
+list of websites can be extracted:
+ 1. Retrieve the organisation list from brreg.no [3]
+ curl https://data.brreg.no/enhetsregisteret/api/enheter/lastned > enheter.gz
+ gzip -d enheter.gz
+ 2. Reshape the JSON data by website URL, sector and business code.
+ cat enheter |
+ jq '[.[] | select(.hjemmeside != null) | {url:.hjemmeside, code:.naeringskode1.kode, sector:.institusjonellSektorkode.kode}]' > webpages.txt
+ 3. Based on the URL, add the primary domain and resolve its MX
+ record and the MX primary domain to each JSON entity
+ 4. Using the JSON-file generated above, populate the following
+ JSON dictionary. This is also a rough categorisation based on
+ the standard provided by Statistics Norway (I'm sure it could
+ be improved) [4]:
+ {
+ "government":{"codes": [6100,6500,1110,1120], "total":0, "counts":{}},
+ "municipals":{"codes": [1510,4900,1520], "total":0, "counts":{}},
+ "finance":{"codes": [3200,3500,3600,4300,3900,4100,4500,4900,5500,5700,4900,7000], "total":0, "counts":{}},
+ "private":{"codes": [4500,4900,2100,2300,2500], "total":0, "counts":{}}
+ }
+ 5. Generate CSV output based on each sector grouping above.
+## The Result
+The top vendor was, not surprisingly, Microsoft's outlook.com. For the
+120k sites, 98k resolved an MX record. Of these I will give an
+outlook.com summary as follows, as it would seem this is the
+dominating actor in all categories:
+* In government 61% are O365 users (1420/2317)
+* For municipals, the share is 55% (688/1247)
+* For the diverse financial grouping, 21% use O365 (4836/23125)
+* For the diverse private companies 38% use O365 (14615/38129)
+Of the 98k sites Microsoft runs the email service for 21559
+organisations. For comparison Google MX domains account
+for about 5500.
+While the above is a direct measurement of who delivers email
+services, it also indicates that these organisations rely on
+other services, such as internal wikis and direct messaging.
+An overview of the top 10 vendors is shown below.
+## Sources of Errors
+Even though I believe the statistics above are representative, they
+have some possible sources of error:
+1. The organisation isn't listed with URL in the organisation
+ registry or it uses a domain not associated with the primary
+ domain of its web address
+2. The organisation uses an SMTP proxy
+3. The organisation has an inactive SMTP record
+I found that there are more than 1 million listed organisations in
+the brreg.no registry and 120k websites in the JSON data
+provided. This means this dataset represents at most 12% of the
+companies listed.
+Also, email doesn't represent a diverse infrastructure, but I
+believe it is an indicator of the current trends also for other
+cloud services in e.g. Azure, Google Compute Engine and so on.
+[1] CyberInsecurity: The Cost of Monopoly, Geer et al., 2003 -
+[2] Cybersecurity as Realpolitik by Dan Geer presented at Black
+Hat USA 2014: https://www.youtube.com/watch?v=nT-TGvYOBpI
+[3] https://data.brreg.no/enhetsregisteret/api/enheter/lastned
+[4] https://www.ssb.no/klass/klassifikasjoner/39
diff --git a/data/msg-eml.md b/data/msg-eml.md
new file mode 100644
index 0000000..e26798c
--- /dev/null
+++ b/data/msg-eml.md
@@ -0,0 +1,58 @@
+Thought I’d share a neat little script-combo if you do your
+email analysis on Linux systems, or do automation. For the
+task you’ll need msgconvert.pl [1] and ripmime [2].
+MSG files are used by Microsoft Outlook, and are the natural
+fit in regard to malicious messages in organizations running
+Microsoft products. For reference you can find the
+specification for the Outlook Item File Format here.
+In this part you will require a file from Outlook, which you
+can acquire by selecting a message and dragging it to the
+desktop or a new message. If you don’t do Outlook, you can
+just google for one [3].
+ msgconvert.pl .msg
+ ripmime -i .mime
+The above will first convert the MSG file to a mime
+file. The latter command will make sure to extract the
+objects in it, such as binary files or documents. The text
+files contain the content of the email and will be
+something like: textfile0
+If you need the headers you will find them at the top of the
+Now to EML-files, which you will also often find when
+exporting email messages. EML is really just short for
+“E-mail”. In OS X Mail, Outlook Express, Thunderbird (and
+others) you are typically presented with EML/MIME-formatted
+documents, and it’s just a document which complies with RFC
+822 [4]. EML-files are easier to work on since you can
+open them in a text editor and read the essential information
+in plain text straight away.
+So what does that mean in regard to ripmime? It really just
+means that instead of calling the output from msgconvert.pl
+.mime, you can name the file .eml. In
+ ripmime -i .eml
+The above will output your mime parts.
+## OS X Specifics
+You may want to do the above on an OS X system as well. For
+this you can install ripmime via Homebrew [5].
+If you are exporting an eml from Apple Mail you may do so
+the same way as in Outlook: Just drag it where you want it.
+[1] https://www.matijs.net/software/msgconv/
+[2] https://www.pldaniels.com/ripmime/
+[3] https://www.google.com/search?q=filetype:msg&oq=filetype:msg#q=filetype:msg+outlook
+[4] https://tools.ietf.org/html/rfc822
+[5] https://brew.sh/index_nb
diff --git a/data/new-format.md b/data/new-format.md
new file mode 100644
index 0000000..02a9fc0
--- /dev/null
+++ b/data/new-format.md
@@ -0,0 +1,70 @@
+After being off the HTML grid for a while, using Hugo as a
+static site generator for Gopher, I grew tired of the
+upgrade and complexity issues with publishing new
+content. It all culminated with Hugo refusing to generate
+the site at all after the last update.
+Because of the Hugo failure I needed to create a new
+strategy, and not being willing to change to another complex
+generator system I went hunting for something else.
+I am happy with my current backend publishing setup, which
+uses git and a post-receive hook:
+ pwd=$(pwd)
+ if test -z "${pwd##*.git}"
+ then repo="$pwd"
+ else repo="$pwd/.git"
+ fi
+ git --work-tree=~/secdiary/content --git-dir=~/secdiary/content.git checkout -f
+ cd ~/secdiary
+ rm -r /var/www/secdiary.com/*
+ rm -r /var/gopher/*
+ cp -R html/* /var/www/secdiary.com/
+ cp -R gopher/* /var/gopher/
+ cp ~/twtxt/content/twtxt.txt /var/www/secdiary.com/
+ echo "\nBuild: " >> /var/gopher/index.gph
+ git --git-dir=~/secdiary/content.git log -1 --pretty="%H%n%ci" >> /var/gopher/index.gph
+I also publish twtxt messages in a similar way. My twtxt
+config looks like the following:
+ [twtxt]
+ nick = tommy
+ twtfile = ~/twtxt/twtxt.txt
+ twturl = http://secdiary.com
+ disclose_identity = False
+ character_limit = 140
+ character_warning = 140
+ post_tweet_hook = "cd ~/twtxt/ && git pull && git add twtxt.txt && git commit -m 'added new tweet' && git push"
+In addition to my twtxt feed, I am present on Mastodon,
+which led me to Solene's static site generator cl-yag
+[1,2]. I decided to generate the site client-side for
+now, but in the future I'll likely move this to the server
+for less complex workflows on my workstations. This also
+fits me well since I'll be moving more of my workflow to
+OpenBSD in the coming months.
+The layout of my new site is more or less shamelessly stolen
+from Solene as well. I plan to customize that to my liking as
+we go.
+And with that I am back in the WWW space, however in a
+limited format. I am currently reviewing my 50 current
+posts and will assess what can be of use in the future. This
+will involve some rewriting as well, since this space will
+be text-only out of respect for your time.
+I also enabled TLS on the site for those that would like to
+browse privately, as opposed to my current Gopher setup. The
+latter you may find on ``gopher://secdiary.com``.
+Feel free to reach out to me in the Fediverse. I'm there as
+[1] https://dataswamp.org/\~solene/2018-10-12-cl-yag-20181012.html
+[2] git://bitreich.org/cl-yag
diff --git a/data/novel-pdf-detection.md b/data/novel-pdf-detection.md
new file mode 100644
index 0000000..45d65fe
--- /dev/null
+++ b/data/novel-pdf-detection.md
@@ -0,0 +1,792 @@
+For some time now the Portable Document Format standard has
+been a considerable risk in regard to corporate as well as
+private information security concerns. Some work has been
+done to classify PDF documents as malicious or benign, but
+not as much when it comes to clustering the malicious
+documents by techniques used. Such clustering would provide
+insight, in automated analysis, to how sophisticated an
+attack is and who staged it. A 100.000 unique PDF dataset
+was supplied by the Shadowserver foundation. Analysis of
+experiment results showed that 97% of the documents
+contained javascripts. This and other sources revealed that
+most exploits are delivered through such, or similar object
+types. Based on that, javascript object labeling gets a
+thorough focus in the paper.
+The scope of the paper is limited to extending the attribution
+research already done in regard to PDF documents, so that a
+feature vector may be used in labeling a given PDF (or a batch
+of PDFs) to a relevant cluster. This is an attempt to recognize
+different techniques and threat agents.
+> Javascript is currently one of the most exploited PDF
+ objects. How can the PDF feature vector be extended to
+ include a javascript subvector correctly describing the
+ technique/style, sophistication and similarity to previous
+ malicious PDF documents. How does it relate to the term
+ digital evidence?
+> — Problem statement
+The problem statement considers the coding styles and
+obfuscation techniques used and the related sophistication
+in the coding style. Last, but most important, the statement
+involves how the current PDF document measures up to others
+previously labeled. These are all essential problems when it
+comes to automated data mining and clustering.
+### A. Related Work
+Proposed solutions for malicious contra benign
+classification of PDF documents have been explicitly
+documented in several papers. Classification using support
+vector machines (SVM) was handled by Jarle Kittilsen in his
+recent Master's thesis1.
+Further, the author of this paper in his bachelor's thesis2
+investigated the possibility to detect obfuscated malware by
+analyzing HTTP data traffic known to contain malware. In
+regard, the findings were implemented, designed and tested
+in Snort. Some of the detection techniques will be used as a
+fundament for labeling in this paper.
+Even though much good work has been done in the area of
+analyzing malicious PDF documents, many of the resulting
+tools are based on manual analysis. To be mentioned is
+Didier Stevens who developed several practical tools, such
+as the PDF parser and PDFid. These tools are not only tools,
+but were the beginning of a structured way of looking at
+suspicious objects in PDF documents as well. To be credited
+as well is Paul Baccas at Sophos, who did considerable
+work on characterizing malicious contra benign PDF
+The paper will be doing research into the feature,
+javascript subvector of malicious PDF documents. To be able
+to determine an effective vector (in this experimental
+phase), it is essential that the dataset is filtered,
+meaning that the files must be malicious. As Kittilsen has
+done in regard to PDF documents, Al-Tharwa et al.2 have done
+interesting work to detect malicious javascript in browsers.
+## Background
+### A.1. The Feature Vector in Support of Digital Evidence
+Carrier and Spafford defined "digital evidence" as any
+digital data that contain reliable information that supports
+or refutes a hypothesis about the incident7. Formally, the
+investigation process consists of five parts and is
+specially crafted for maintaining evidence integrity, the
+order of volatility (OOV) and the chain of custody. This all
+leads up to the term forensic soundness.
+The investigation process consists of five phases. Note the
+identification and analysis phase.
+![Fig. 1: The investigation process. The investigation
+ process consists of five phases9. Note the identification
+ and analysis
+ phase](/images/2015/02/Theinvestigationprocess-e1380485641223.png)
+In this paper, forensic soundness is a notion previously
+defined10 as meaning: No alteration of source data has
+occurred. Traditionally this means that every bit of data is
+copied and no data added. The previous paper stated two
+elementary questions:
+* Can one trust the host where the data is collected from?
+* Does the information correlate to other data?
+When it comes to malicious documents, they are typically
+collected in two places:
+1. In the security monitoring logging, the pre-event phase
+2. When an incident has occurred and as part of the reaction to an
+ incident (the collection phase)
+Now, the ten thousand dollar question: When a malicious
+document gets executed on the computer, how is it possible
+to get indications that alteration of evidence has occurred?
+The answer is potentially the first collection point, the
+pre-event logging.
+In many cases, especially considering targeted attacks, it
+is not possible to state a PDF document as malicious in the
+pre-event phase. The reason for this is often the way the
+threat agent crafts his attack to evade the security
+mechanisms in the target using collected intelligence. Most
+systems in accordance to local legislation should then
+delete the content data. A proposition though is to store
+the feature vector.
+The reasoning behind storing a feature vector is quite
+simple: When storing hashes, object counts and the
+javascript subvector which we will return to later in the
+paper, it will be possible to indicate if the document
+features have changed. On the other side there is no
+identifiable data invading privacy.
+It is reasonable to argue that the measure of how similar
+one PDF document is to another, is also the measure of how
+forensically sound the evidence collected in a post-event
+phase is. How likely it is that the document acquired in the
+collection phase is the same as the one in the pre-phase is
+decided by the characteristics supplied by the feature
+vectors of both. Further, the feature-vector should be as
+rich and relevant as possible.
+![Fig. 2: Correlation by using the feature vector of the PDF
+ document. Illustration of a possible pre/post incident
+ scenario](/images/2015/02/Preandpost.png)
+### A.2. Identification as an Extension of Similarity
+The notion of similarity largely relates to the feature
+vector: How is it in large quantities of data possible to
+tell if the new PDF document carries similar characteristics
+like others of a larger dataset.
+In his work with semantic similarity and preserving hashing,
+M. Pittalis11 defined similarity from the Merriam-Webster Dictionary:
+> Similarity: The existence of comparable aspect between two
+> elements
+> – Merriam-Webster Dictionary
+The measure of similarity is important in regard to
+clustering or grouping the documents. When clustering
+datasets the procedure is usually in six steps, finding the
+similarity measure is step 2.
+1. Feature selection
+2. Proximity/similarity measure
+3. Clustering criterion
+4. Clustering algorithm
+5. Validation
+6. Interpretation
+In this paper the k-means unsupervised learning clustering
+algorithm was considered. This simple algorithm groups the
+number n observations into k clusters22. Each observation
+relates to the cluster with the nearest mean.
+Now, as will be seen over the next two sections, work done
+in the subject is mostly missing out on giving a valid
+similarity measure when it comes to classifying PDF
+documents as anything other than malicious or benign. So, to
+be able to cluster the PDF documents the feature vector will
+need a revision.
+As Pittalis introduced the concept of similarity, it is
+important to define one more term: Identification. According
+to the American Heritage Dictionary, identification is:
+> Proof or Evidence of Identity.
+> — The American Heritage Dictionary
+In our context this means being able to identify a PDF
+document and attribute it to e.g. a certain type of botnet
+or perhaps more correct a coding or obfuscation
+technique. In an ideal state this will give an indication to
+which threat agent is behind the attack. This is something
+that has not been researched extensively in regard to PDF
+documents earlier.
+### C. The Portable Document Format
+When it comes to the feature vector of the portable document
+format (PDF), it is reasonable to have a look at how PDF
+documents are structured. The PDF consists of objects, each
+object is of a certain type. As much research has been done
+on the topic previously, the format itself will not be
+treated any further in this paper12.
+![A simplified illustration of the portable document format](/images/2015/02/ObjectdescriptionPDF-2.png)
+When considering malicious PDF documents, relevant
+statistics has shown the following distribution of resource
+**Known Malicious Datasets Objects** A table showing a
+number of interesting and selected features in malicious seen
+against clean PDF documents. Baccas used two datasets where
+one indicated slightly different results.
+ Dataset Object Type Clean (%) Malicious (%)
+ The Shadowserver 100k PDF malicious dataset /JavaScript NA 97%
+ --
+ Paul Baccas' Sophos 130k malicious/benign dataset3 /JavaScript 2% 94%
+ /RichMedia 0% 0,26%
+ /FlateDecode 89% 77%
+ /Encrypt 0,91% 10,81%
+What can be seen from the table above is that when it comes to
+the distribution of objects in malicious files, most of them
+contain javascript. This makes it very hard to distinguish
+and find the similarity between the documents without
+considering a javascript subvector. The author would argue
+that this makes it a requirement for a javascript subvector
+to be included in the PDF feature vector to make it
+valid. In previous work, where the aim has been to
+distinguish between malicious and benign, this has not been
+an issue.
+### D. Closing in on the Core: The PDF Javascript Feature Subvector
+Javascript is a client-side scripting language primarily
+offering greater interactivity with webpages. Specifically
+javascript is not a compiled language, weakly-typed4 and has
+first-class functions5. In form of rapid development, these
+features give great advantages. In a security perspective
+this is problematic. The following states a Snort signature
+to detect a javascript "unescape"-obfuscation technique2 (we
+will return to the concept of obfuscation later on):
+ alert tcp any any -> any any (msg:”Obfuscated unescape”; sid: 1337003; content:”replace”; pcre:”/u.{0,2}n.{0,2}e.{0,2}s.{0,2}c.{0,2}a.{0,2}p.{0,1}e’ ?.replace (/”;rev:4;)
+Traditionally javascript is integrated as a part of a
+browser. Seen from a security perspective, this opens for
+what is commonly known as client-side attacks. More
+formally: Javascript enables programmatic access to
+computational objects within a host environment. This is
+complicated as javascript comes in different flavors, making
+general parsing and evaluation complex6, as may be seen from
+the above signature. The flavors are often specific to the
+application. Today, most browsers are becoming more aligned
+due to the requirements of interoperability. Some
+applications, such as the widely deployed Adobe Reader has
+some extended functionality though, which we will be
+focusing on in this paper.
+Even though javascript may pose challenges to security, it
+is important to realize that this is due to
+complexity. Javascript (which is implemented through
+SpiderMonkey in Mozilla18-products and in Adobe Reader as
+well) builds on a standard named ECMA-262. The ECMA is a
+standardization organ of Information and Communication
+Technology (ICT) and Consumer Electronics (CE)17. Thus,
+Javascript is built from the ECMAScript scripting language
+standard. To fully understand which functions are essential
+in regard to malicious Javascripts this paper will rely on
+the ECMAScript Language Specification19 combined with expert
+### E. Introducing Obfuscation
+Harawa et al.8 describes javascript obfuscation by six elements:
+* Identifier reassignment or randomization
+* Block randomization
+* White space and comment randomization
+* Strings encoding
+* String splitting
+* Integer obfuscation
+Further, Kittilsen1 documented a javascript feature vector
+which states the following functions as potentially
+malicious: [function, eval_length, max_string, stringcount,
+replace, substring, eval, fromCharCode]. Even though his
+confusion matrix shows good results, there are some problems
+when it comes to evaluating these as is: Such characters are
+usually obfuscated. The following is an example from sample
+ if((String+'').substr(1,4)==='unct'){e="".indexOf;}c='var _l1="4c206f5783eb9d;pnwAy()utio{.VsSg',h<+I}*/DkR%x-W[]mCj^?:LBKQYEUqFM';l='l';e=e()[((2+3)?'e'+'v':"")+"a"+l];s=[];a='pus'+'h';z=c's'+"ubstr" [1];sa [2];z=c's'+"ubstr" [3];sa [2];z=c['s'+"ubstr"] [...]e(s.join(""));}
+The above example tells an interesting story about the
+attacker's awareness of complexity. In respect to Kittilsen's
+javascript feature vector the above would yield the
+following result: [0,x,x,x,0,0,0,0] (considerable results on
+the second to fourth, plus one count if we are to shorten
+substring to substr), in other words the features are to be
+found in the embedded, obfuscated javascript, but not in
+clear text. When it comes to eval_length, max_string and
+stringcount we will return to those later in the paper.
+Deobfuscated, the script would look like:
+ var _l1="[...]";_l3=app;_l4=new Array();function _l5(){var _l6=_l3.viewerVersion.toString();_l6=_l6.replace('.','');while(_l6.length&4)_l6l='0';return parsetnt(_l6,10);function _l7(_l8,_l9){while(_l8.length+2&_l9)_l8l=_l8;return _l8.substring(0,_l9I2);function _t0(_t1){_t1=unescape(_t1);rote}a*=_t1.length+2;da*/ote=unescape('Du9090');spray=_l7(da*/ote,0k2000Rrote}a*);lok%hee=_t1lspray;lok%hee=_l7(lok%hee,524098);for(i=0; i & 400; ill)_l4xi-=lok%hee.substr(0,lok%hee.lengthR1)lda*/ote;;function _t2(_t1,len){while(_t1.length&len)_t1l=_t1;return _t1.substring(0,len);function _t3(_t1){ret='';for(i=0;i&_t1.length;il=2){b=_t1.substr(i,2);c=parsetnt(b,16);retl=String.froW[har[ode(c);;return ret;function _]i1(_t1,_t4){_t5='';for(_t6=0;_t6&_t1.length;_t6ll){_l9=_t4.length;_t7=_t1.char[odeAt(_t6);_t8=_t4.char[odeAt(_t6D_l9);_t5l=String.froW[har[ode(_t7m_t8);;return _t5;function _t9(_t6){_]0=_t6.toString(16);_]1=_]0.length;_t5=(_]1D2)C'0'l_]0j_]0;return _t5;function _]2(_t1){_t5='';for(_t6=0;_t6&_t1.length;_t6l=2){_t5l='Du';_t5l=_t9(_t1.char[odeAt(_t6l1));_t5l=_t9(_t1.char[odeAt(_t6));return _t5;function _]3(){_]4=_l5();if(_]4&9000){_]5='oluAS]ggg*pu^4?:IIIIIwAAAA?AAAAAAAAAAAALAAAAAAAAfhaASiAgBA98Kt?:';_]6=_l1;_]7=_t3(_]6);else{_]5='*?lAS]iLhKp9fo?:IIIIIwAAAA?AAAAAAAAAAAALAAAAAAAABk[ASiAgBAIfK4?:';_]6=_l2;_]7=_t3(_]6);_]8='SQ*YA}ggAA??';_]9=_t2('LQE?',10984);_ll0='LLcAAAK}AAKAAAAwtAAAALK}AAKAAAA?AAAAAwK}AAKAAAA?AAAA?gK}AAKAAAA?AAAAKLKKAAKAAAAtAAAAEwKKAAKAAAAwtAAAQAK}AUwAAA[StAAAAAAAAAAU}A]IIIII';_ll1=_]8l_]9l_ll0l_]5;_ll2=_]i1(_]7,'');if(_ll2.lengthD2)_ll2l=unescape('D00');_ll3=_]2(_ll2);with({*j_ll3;)_t0(*);Ywe123.rawValue=_ll1;_]3();
+Which through the simple Python script javascript feature
+vector generator (appendix 1), yields:
+ ['function: 9', 'eval_length: x', 'max_string: x', 'stringcount: x', 'replace: 1', 'substring|substr: 4', 'eval: 0', 'fromCharCode: 0']
+Harawa et al.'s six elements of javascript obfuscation are
+probably a better, or a necessary supplemental, approach to
+Kittilsen's work.
+There is a notable difference between deobfuscation and
+detecting obfuscation techniques. The difference consists of
+the depth of insight one might gain in actually
+deobfuscating a javascript as it will reveal completely
+different code while the obfuscation routines may be based
+on a generic obfuscator routine used by several threat
+agents. This is much like the issue of packers in regard to
+This section has shown the difficulties of balancing
+deobfuscation for a more detailed coding style analysis
+against a less specific feature vector by using abstract
+obfuscation detection.
+## Extracting and Analysing a PDF Feature Vector
+### A. Deobfuscation - Emerging Intentions
+Usually the most pressing question when an incident
+involving a PDF document occurs is: Who did it, and what are
+his intentions? This is also a consideration when further
+evolving the PDF feature vector. In the next figure is a
+model describing three groups of threat agents, where one
+usually stands out. For instance, if a Stuxnet-scale attack24
+involving a PDF document is perceived, it will be associated
+with a cluster containing "group 1" entities.
+While Al-Tharwa et al.2 argue for no need for deobfuscation
+in regard to classification, deobfuscation is an important
+step in regard to finding a distinct feature vector. The
+issue is that in most situations it isn't good enough to
+tell if the document is malicious; it is also necessary to tell
+who, what, where and how it was created. In regard to being
+defined as valid digital evidence a rich feature vector (in
+addition to the network on-the-fly hash-sum) is part of
+telling. The latter also makes itself relevant when it comes
+to large quantities of data, where an analyst is not capable
+of manually analyzing and identifying hundreds to tens of
+thousands of PDF documents each day.
+![Fig. 4: The threat agent model. A model describing three
+ groups of attackers. These are necessary to filter and
+ detect in the collection
+ phase](/images/2015/02/threat-agent-model.png)
+### B. Technical Problems During Deobfuscation
+Normally most javascript engines, such as Mozilla's
+Spidermonkey15, Google V816 and others, tend to be
+javascript libraries for browsers and miss some basic
+functionality in regard to Adobe Reader which is the most
+used PDF reader. These engines are most often used for
+dynamic analysis of Javascripts and are a prerequisite when it
+comes to being able to completely deobfuscate javascripts.
+To prove the concepts of this article a static Python
+feature vector generator engine based on a rewritten version
+of the Jsunpack-n14 project is used. The application used in
+the paper is providing a vector based interpretation of the
+static script, meaning it is not run dynamically.
+Reliably detecting malicious PDF documents is a challenge
+due to the obfuscation routines often used. This makes it
+necessary to perform some kind of deobfuscation to reveal
+more functionality. Even if one managed to deobfuscate the
+script one time, there may be several rounds more before it
+is in clear text. This was a challenge not solvable in the
+scope of this article.
+Due to parsing errors, under half of the Shadowserver 100k
+dataset was processed by the custom Jsunpack-n module.
+### C. Introducing Two Techniques: Feature Vector Inversion and Outer Loop Obfuscation Variable Computation
+As has been very well documented so far in the paper it is
+more or less impossible to completely automate a
+deobfuscation process of the PDF format. Obfuscation leaves
+many distinct characteristics though, so the threat agent on
+the other hand must be careful to not trigger anomaly
+alarms. There is a balance. This part of the article
+introduces two novel techniques proposed to be applied to the
+javascript subvector to improve its reliability.
+#### C.1. Outer Loop Obfuscation Variable Computation (OLOVC)
+When the threat agent implements obfuscation, one of his
+weaknesses is being detected using obfuscation. When it
+comes to PDF documents, using javascripts alone is a
+trigger. Now, the threat agent is probably using every trick
+in the book, meaning the 6 elements of javascript
+obfuscation8. The job of an analyst in such a matter will be
+to predict new obfuscation attempts and implement anomaly
+alerts using the extended PDF feature vector.
+Throughout this paper we will name this technique "Outer
+Loop Obfuscation Variable Computation". The term "outer
+loop" most often refers to round zero or the first of the
+deobfuscation routines. Variable computation is as its name
+states, a matter of computing the original javascript
+variable. As we have seen this may be done by either
+deobfuscating the script as a whole including its
+near-impossible-for-automation complexity, or use the
+original obfuscated data. We will have a further look at the
+latter option.
+Take for instance this excerpt from the "Introducing Obfuscation"-section:
+ z=c['s'+"ubstr"](0,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);z=c['s'+"ubstr"](2,1);s[a](z);z=c['s'+"ubstr"](3,1);s[a](z);z=c['s'+"ubstr"](4,1);s[a](z);z=c['s'+"ubstr"](5,1);s[a](z);z=c['s'+"ubstr"](6,1);s[a](z);z=c['s'+"ubstr"](7,1);s[a](z);z=c['s'+"ubstr"](8,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](10,1);s[a](z);z=c['s'+"ubstr"](11,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](13,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](14,1);s[a](z);z=c['s'+"ubstr"](12,1);[...](20,1);s[a](z);z=c['s'+"ubstr"](17,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);z=c['s'+"ubstr"](18,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](11,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](17,1);s[a](z);z=c['s'+"ubstr"](11,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);z=c['s'+"ubstr"](13,1);s[a](z);z=c['s'+"ubstr"](19,1);s[a](z);z=c['s'+"ubstr"](11,1);s[a](z);z=c['s'+"ubstr"](14,1);s[a](z);z=c['s'+"ubstr"](17,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](6,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](6,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](6,1);s[a](z);
+Harawa et al. defined the above obfuscation technique as
+"string splitting" (as seen in the section "Introducing
+obfuscation"). The following two obfuscation-extraction
+regular expressions were previously stated in the author's
+Bachelor's thesis2:
+ e.{0,2}v.{0,2}a.{0,2}l.{0,1}
+ u.{0,2}n.{0,2}e.{0,2}s.{0,2}c.{0,2}a.{0,2}p.{0,1}e
+Keep the two above statements and the previous code excerpt
+in mind. When breaking down the above expressions we
+introduce one more regular expression:
+ s.{0,4}u.{0,4}b.{0,4}s.{0,4}t.{0,4}r.{0,4}
+While searching the script for the plain-text string "substr"
+will certainly fail, the above expression will match e.g.:
+ 's'+"ubstr"
+Recall Kittilsen's javascript feature vector: ``[function,
+eval_length, max_string, stringcount, replace, substring,
+eval, fromCharCode]``. If extended by the above techniques,
+the results are somewhat different.
+Without string splitting detection:
+ ['function: 9', 'eval_length: x', 'max_string: 10849', 'stringcount: 1', 'replace: 1', 'substring|substr: 4', 'eval: 0', 'fromCharCode: 0']
+With outer loop obfuscation variable computation:
+ ['function: 0', 'eval_length: x', 'max_string: 67', 'stringcount: 2', 'replace: 0', 'substring: 0', 'substr: 3663', 'eval: 1', 'fromCharCode: 0']
+Additionally, rewriting and extending Kittilsen's feature
+vector by several other typically suspicious functions
+should give preferable results: ``[max_string, stringcount,
+function, replace, substring, substr, eval, fromCharCode,
+indexof, push, unescape, split, join, sort, length, concat]``.
+This makes the following results in two random, but related, samples:
+ [SHA256:5a61a0d5b0edecfb58952572addc06f2de60fcb99a21988394926ced4bbc8d1b]:{'function': 0, 'sort': 0, 'unescape': 0, 'indexof': 0, 'max_string': 10849, 'stringcount': 2, 'replace': 0, 'substring': 0, 'substr': 1, 'length': 1, 'split': 2, 'eval': 0, 'push': 0, 'join': 1, 'concat': 0, 'fromCharCode': 0}
+ [SHA256:d3874cf113fa6b43e7f6e2c438bd500edea5cae7901e2bf921b9d0d2bf081201]:{'function': 0, 'sort': 0, 'unescape': 0, 'indexof': 0, 'max_string': 67, 'stringcount': 1, 'replace': 0, 'substring': 0, 'substr': 3663, 'length': 0, 'split': 0, 'eval': 0, 'push': 1, 'join': 1, 'concat': 0, 'fromCharCode': 0}
+It may perhaps not need a comment, but in the above results
+we see that there are two types of elements in the feature
+vector that stands out: max_string and two of the suspicious
+Summarized the "Outer Loop Obfuscation Variable Computation"
+may be used to, at least partially, defeat the malware
+authors obfuscation attempts. By running the somewhat
+complex regular expressions with known malicious obfuscation
+routines, the implementation result of the 100.000 PDF
+dataset may be seen in the following table: Dataset
+generalization by "outer loop obfuscation variable
+computation" Dataset aggregated by counting javascript
+variables and functions, OLOVC applied (due to errors in the
+jsunpack-n the total number of entities calculated is
+ Word Count
+ function 651
+ sort 7579
+ unescape 4
+ toLowerCase 1
+ indexof 8
+ max_string 42346
+ stringcount 41979
+ replace 70
+ substring 91
+ replace 70
+ substring 91
+ substr 38952
+ length 1512
+ split 9621
+ eval 77
+ push 260
+ join 91
+ inverse_vector 41423
+ concat 86
+ fromCharCode 45
+By the counts in the above table it is shown that the
+selected feature vector has several very interesting
+features. On a sidenote: Even though some features have a
+larger quantity than others it should be mentioned that this
+is not necessarily the measure of how good that feature is.
+This is especially the case with the inverse vector, which we
+will become more familiar with in the next section. Also, as
+previously mentioned it is interesting to see the
+composition of multiple features to determine the origin of
+the script (or the script style if you'd like). The
+aggregation script is attached in appendix 2.
+The "Outer Loop Obfuscation Variable Computation" will
+require a notable amount of computational resources in
+high-quantity networks due to the high workload. In a way
+this is unavoidable since the threat agent's objective of
+running client-side scripts is to stress the resources of
+such systems.
+![Fig. 5: Illustration of Computational Complexity. The illustration shows the computational load on a network sensor in regard to different obfuscation techniques](/images/2015/02/Skjermbilde-2012-05-08-kl--20-43-04.png)
+### C.2. Feature Vector Inversion
+Threat agents go a long way in evading detection
+algorithms. The following thought is derived from a common
+misconception in database security:
+> A group of ten persons whose names are not to be revealed
+ is listed amongst a couple of thousand, in an
+ organization's LDAP directory. The group, let us name it X,
+ is not to be revealed and is therefore not named in the
+ department field.
+While the public may not search and filter directly on the
+department name, being X, an indirect search would be
+successful in revealing the group due to the ten persons being
+the only ones not associated with a department.
+The concept of searching indirectly may be applied to
+evaluating javascripts in PDF documents as well. We might
+start off with some of the expected characters found in
+benign javascript documents:
+ {'viewerVersion':1,'getPrintParams':1,'printd':1,'var':10,'getPageNthWord':1,'annot':2,'numPages':1,'new':3}
+The above is found by expert knowledge to be the variables
+and functions probably used in a benign javascript or other
+object. Many of these functions are used in interactive PDF
+documents, e.g. providing print buttons.
+A weight is added to each cleartext function/variable. After
+counting the words in the document a summarized variable
+named the inverted_feature_vector gives an integer. The
+higher the integer, the higher the probability of the
+javascript being benign.
+The inverted feature vector may be used as a signature and a
+whitelist indication database may be built of datasets. In
+the 100k malicious dataset the statistics showed that out of
+42475, 41423 had more than one occurrence of a known benign
+variable. This might seem like a less good feature, but the
+quantity is not the issue here, it is the weight of each
+variable. So: One may say that the higher the inverse vector
+is, the more likely it is that the PDF or javascript is
+benign. To clarify, next table shows variables fragmented by
+weight: Inverse vector separated by interval, the
+**Shadowserver 100k dataset** _The table shows that most
+malicious PDF files in the 100k Shadowserver dataset
+contains low-weighted scores when it comes to the inverted
+vector as a measure of how benign the scripts are._
+ Weight interval Instances Instance percentage
+ <10 15232 35,6%
+ 20<>9 26852 62,8%
+ 30<>19 136 ~0%
+ 40<>29 148 ~0%
+ 50<>39 87 ~0%
+ 60<>49 28 ~0%
+ >60 253 ~0%
+ Total 42736 -
+The inversion vector may as well be seen as a measure of the
+likeliness that the script is obfuscated. A quick look at
+the table shows that the characteristics of obfuscation are
+found in most PDF documents in the Shadowserver 100k dataset.
+Even though this part of the vector should be seen as an
+indication, analysts should be aware that threat agents may
+adapt to the detection technique and insert clear text
+variables such as the ones listed above in addition to their
+malicious javascripts. This latter would function as a
+primitive feature vector inversion jammer. In other words it
+should be seen in context with the other items of the
+javascript feature vector as well. Further, the concept
+should be further evolved to avoid such evasion. One
+technique is to segment the code before analyzing it (giving
+each code segment a score, finally generating an overall
+probability score), making it more difficult for the threat
+agent to utilize noise in his obfuscation.
+### D. Clustering
+Experience shows that in practically oriented environments
+security analysis is, at least partially, done in a manual
+manner. That is to say, the detection is based on
+indicators or anomalies and the analysis of the detection
+results is performed manually by an analyst. Though this may
+possibly be the approach resulting in least false positives
+it is overwhelming in regard to analysis of all potentially
+malicious PDF documents in a larger organization. The 100k PDF
+dataset used in this paper is evidence of such. So, how is it
+possible to automatically detect the interesting parts of
+the 100k PDF dataset? This question leads to the concept of
+data mining.
+The definition of data mining is the transformation of data
+to "meaningful patterns and rules".
+Michael Abernethy at IBM developerWorks20 covers data mining quite extensively.
+#### D.1. A Narrow Experiment and Results
+In this paper the goal is to achieve a view of the dataset
+in a way that is named "undirected" data mining: Trying to
+find patterns or rules in existing data. This is achieved
+through the feature vector previously presented.
+Up until now this paper has discussed how to generate a
+satisfactory feature vector and what makes the measure of
+similarity. Let us do an experiment using WEKA (Waikato
+Environment for Knowledge Analysis) for analyzing our
+feature vector.
+Appendix 3 describes the ARFF format found from our feature
+vector and two of the previously presented feature vectors
+and a random selection of 2587 parseable PDF-documents from
+the dataset.
+In this experiment the feature vector was produced from 200
+random samples from the 100k dataset. Interesting in that
+regard is that the subdataset loaded from originally
+contained 6214 samples, while our application only handled
+the decoding of under half. The feature vector was extracted
+in a CSV format, converted by the following WEKA Java class
+and loaded in WEKA:
+ java -classpath /Applications/weka-3-6-6.app/Contents/Resources/Java/weka.jar weka.core.converters.CSVLoader dataset.csv
+In the WEKA preprocessing, the results may be visualized:
+![Fig. 6: Results 1; PDF Feature Vector Distribution. A model
+ showing the PDF feature vector object distribution using
+ the 2587 parsable PDF
+ documents](/images/2015/02/Skjermbilde-2012-05-16-kl--13-17-20.png)
+### D.2. The complete dataset
+Next loading the complete feature vector dataset consisting
+of 42736 entities showed interesting results when
+![Fig. 7: Stringcount vs anomalies in the inverse
+ vector. Stringcount vs anomalies in the
+ inverse_vector. Using k-means algorithm and k=5. Medium
+ Jitter to emphasize the
+ clusters](/images/2015/02/Skjermbilde-2012-06-27-kl--11-40-19.png)
+The cluster process above also enables the possibility to
+look at the anomalies where the inverse_vector is high. For
+instance 9724 (the highest one in the Y-axis) the
+inverse_vector is 21510 which is a very clear anomaly
+compared to the rest of the clusters (the distance is
+far). This should encourage a closer look at the file based
+on the hash.
+The Shadowserver 100k ARFF dataset will be further evolved and may be found at the project GitHub page25.
+### E. Logging and Interpreting Errors
+Again and again while analyzing the 100k dataset the
+interpreter ran into parsing errors. Bad code one may say,
+but a fact is that the threat agents are adapting their code
+to evade known tools and frameworks. An example of this is
+a recent bug21 in Stevens' PDF parser where empty PDF objects
+in fact created an exception in the application.
+So, what does this have to do with this paper? Creative
+threat agents can never be avoided, creating malicious code
+that avoids the detection routines. This makes an important
+point, being that the application implemented should be
+using strict deobfuscation and interpretation routines. When
+an error occurs, which will happen sooner or later, the file
+should be traceable and manually analyzed. This in turn
+should lead to an adaption of the application. Where the
+routines fail will also be a characteristic of the threat
+agent: What part of the detection routines does he try to
+evade? E.g. in the 100k dataset an error on the
+ascii85-filter occurred. The parsing error caused the
+parser-module not to output a feature vector, and was
+detected by error monitoring in log files.
+## Discussion and Conclusions
+In regard to being used standalone as evidence the feature
+vector will have its limitations, especially since its hard
+to connect it to an event it should be considered
+The PDF and ECMA standard are complex and difficult to
+interpret, especially when it comes to automation. As has
+been shown in this article a really hard problem is
+dynamically and generically executing javascripts for
+deobfuscation. This is also shown just in the Adobe Reader,
+where e.g. Adobe Reader X uses Spidermonkey 1.8, while
+previous more prevalent versions use version 1.7 of
+Spidermonkey. This often resulted in parsing errors, and
+again it will potentially cause a larger error rate in the
+next generation intrusion detection systems.
+It has been proved that a static analysis through a
+Jsunpack-n modification recovers good enough round-zero
+data, from a little less than half of the Shadowserver 100k
+dataset, to generate a characteristic of each file. The
+results were somewhat disappointing in regard to the
+extensive parsing errors. Parsing optimization and error
+correction making the script more robust and reliable should
+be covered in a separate report. Despite the latter a good
+foundation and enough data were given to give a clue for
+what to expect from the extended PDF feature vector. Also,
+the inverse vector with its weighting gives an individual
+score to each document, making it exceptionally promising
+for further research.
+In regard to OLOVC a certain enhancement would be to combine
+it with the work of Franke and Petrovic, "Improving the
+efficiency of digital forensic search by means of constrained
+edit distance". Their concept seems quite promising and
+might provide valuable input to OLOVC.
+The dataset used in this article may contain certain flaws
+in its scientific foundation. No dataset flaws, but
+indications that some data originates from the same source, have
+been seen throughout this article. The reason is most
+probably that the dataset was collected over three
+continuous days. Linked to the behaviour of malware it is
+known that certain malware such as drive-by attacks has
+peaks in its spread as a function of time. It is therefore
+natural to assume that there are larger occurrences of PDF
+documents originating from the same threat agent. On the
+other hand, in further research, this should be a measure of
+the effectiveness of the algorithms' ability to group the data.
+The Shadowserver 100k dataset only contains distinct
+files. It would be interesting to recollect a similar
+dataset with non-distinct hash-entries, and to cluster it by
+fuzzy hashing as well.
+Even though clustering is mentioned in the last part of this
+article, further extensive research should be done to
+completely explore the potential of using the current
+feature vector. In other words the scope of the article
+permitted for a manual selection of a feature vector and a
+more or less defined measure of similarity through the
+extended PDF feature vector.
+The project has a maintained GitHub page as introduced in
+the last section. This page should encourage further
+development into the extended PDF feature vector.
+If you'd like, please have a look at the GuC Testimon Forensic Laboratory [1].
+[1] GuC Testimon Forensic Laboratory: https://sites.google.com/site/testimonlab/
diff --git a/data/osquery.md b/data/osquery.md
new file mode 100644
index 0000000..bc53503
--- /dev/null
+++ b/data/osquery.md
@@ -0,0 +1,211 @@
+In another post I wrote about how telemetry is a challenge [1] of
+a changing and more diverse and modern landscape. Recently I have
+reviewed some device inventory and endpoint detection tools that
+will add to the solution. In the future I will get back to my view
+on Mozilla InvestiGator (MIG) [2], but this post will focus on a
+telemetry collection tool that I have grown fond of: osquery [3].
+osquery was originally developed by Facebook for the purpose of
+> Maintaining real-time insight into the current state of your infrastructure[...]
+With osquery data is abstracted, in the operating system in which
+the agent runs, to a SQL-based interface. It contains a
+near-infinite amount of available data, which is perfect to a
+network defender. osquery can even parse native sqlite-databases,
+which there are lots of in macOS. It also works in a distributed
+mode like GRR and MiG. In practical terms this means that queries
+are distributed. On the other hand, events can be streamed as well
+when considering operational security.
+![Example of the hardware_events table when plugging in and then detaching a Yubikey](/static/img/data/osquery_hardware_events.png)
+Since 2014 osquery has been open sourced and now has a large
+community developing about every aspect of the tool. According to
+the briefs that are online, several major institutions, including
+Facebook, now use osquery in service networks.
+osquery is cross-platform, and now supports: Linux, FreeBSD,
+Windows and macOS. That is also some of what separates it from its
+alternatives, like sysmon.
+Posts about osquery that you should review before moving on:
+* Doug Wilson's excellent presentation on FIRST 2018
+ (security-usage focused) [5]
+* Managing osquery with Kolide (an osquery tls server) [6]
+* Another post on applying osquery for security [7]
+* Palantir on osquery [8]
+So that was a couple of links to get you started. The next section shows you how to quickly get a lab environment up and running.
+## Setup and Configuration
+### Prerequisites
+There are only two things that you need set up for the rest of this
+article if you are on macOS, which can both be easily installed
+using Homebrew [9]:
+ brew install go yarn
+Also you need to configure your Go-path, which can basically be:
+ echo "export GOPATH=$HOME/go" >> ~/.bash_profile
+### Server Setup
+Setup Docker image of Kolide Fleet [10]:
+ mkdir -p $GOPATH/src/github.com/kolide
+ cd $GOPATH/src/github.com/kolide
+ git clone git@github.com:kolide/fleet.git
+ cd fleet
+ make deps && make generate && make
+ docker-compose up
+Populate the database:
+ ./build/fleet prepare db
+You are now ready to boot up the web UI and API server:
+ ./build/fleet serve --auth_jwt_key=3zqHl2cPa0tMmaCa9vPSEq6dcwN7oLbP
+Get enrollment secret and certificate from the Kolide UI at
+``https://localhost:8080`` after doing the registration process.
+![Kolide enrollment](/static/img/data/kolide-enrollment.png)
+### Client Setup
+Make the API-token (enrollment secret) persistent at the
+ export {enrollment-secret} > /etc/osquery/enrollment.secret
+Define flags file in ``/private/var/osquery/osquery.flags``. This
+one the client uses to apply the centralised tls logging method,
+which is the API Kolide has implemented. It is also certificate
+pinned, so all is good.
+ --enroll_secret_path=/etc/osquery/enrollment.secret
+ --tls_server_certs=/etc/osquery/kolide.crt
+ --tls_hostname=localhost:8080
+ --host_identifier=uuid
+ --enroll_tls_endpoint=/api/v1/osquery/enroll
+ --config_plugin=tls
+ --config_tls_endpoint=/api/v1/osquery/config
+ --config_tls_refresh=10
+ --disable_distributed=false
+ --distributed_plugin=tls
+ --distributed_interval=10
+ --distributed_tls_max_attempts=3
+ --distributed_tls_read_endpoint=/api/v1/osquery/distributed/read
+ --distributed_tls_write_endpoint=/api/v1/osquery/distributed/write
+ --logger_plugin=tls
+ --logger_tls_endpoint=/api/v1/osquery/log
+ --logger_tls_period=10
+You can start the osquery daemon on the client by using the
+following command. At this point you should start thinking about
+packaging, which is detailed in the osquery docs [11].
+ /usr/local/bin/osqueryd --disable_events=false --flagfile=/private/var/osquery/osquery.flags
+osquery also has an interactive mode if you would like to test the
+local instance, based on a local configuration file:
+ sudo osqueryi --disable_events=false --config_path=/etc/osquery/osquery.conf --config_path=/etc/osquery/osquery.conf
+To make the client persistent on macOS, use the following
+documentation from osquery [12].
+### Managing the Kolide Configuration
+For this part I found what worked best was using the Kolide CLI
+client [13]:
+ ./build/fleetctl config set --address https://localhost:8080
+ ./build/fleetctl login
+ ./build/fleetctl apply -f ./options.yaml
+The ``options.yaml`` I used for testing was the following. This
+setup also involves setting up the osquery File Integrity
+Monitoring (FIM) [14], which I wasn't able to get working by the
+patching curl command [15] in the docs. The config monitors
+changes in files under ``/etc`` and a test directory at
+ apiVersion: v1
+ kind: options
+ spec:
+ config:
+ decorators:
+ load:
+ - SELECT uuid AS host_uuid FROM system_info;
+ - SELECT hostname AS hostname FROM system_info;
+ file_paths:
+ etc:
+ - /etc/%%
+ test:
+ - /var/tmp/filetest/%%
+ options:
+ disable_distributed: false
+ distributed_interval: 10
+ distributed_plugin: tls
+ distributed_tls_max_attempts: 3
+ distributed_tls_read_endpoint: /api/v1/osquery/distributed/read
+ distributed_tls_write_endpoint: /api/v1/osquery/distributed/write
+ logger_plugin: tls
+ logger_tls_endpoint: /api/v1/osquery/log
+ logger_tls_period: 10
+ pack_delimiter: /
+ overrides: {}
+## Next Steps
+Through this article we've reviewed some of the basic capabilities
+of osquery and also had a compact view on a lab-setup
+demonstrating centralised logging, to Kolide, using the tls API of
+A couple of things that I would have liked to see was support for
+OpenBSD [16], Android and iOS [17].
+The local setup obviously does not scale beyond your own
+computer. I briefly toyed with the idea that this would be a
+perfect fit for ingesting into a Hadoop environment, and not
+surprisingly there's a nice starting point over at the Hortonworks
+forums [18].
+There's a lot of open source information on osquery. I also found
+the Uptycs blog useful [19].
+[1] https://secdiary.com/2018-02-25-telemetry.html
+[2] https://mig.mozilla.org
+[3] https://osquery.io
+[4] https://code.fb.com/security/introducing-osquery/
+[7] https://medium.com/@clong/osquery-for-security-part-2-2e03de4d3721
+[8] https://github.com/palantir/osquery-configuration
+[9] https://brew.sh
+[11] https://osquery.readthedocs.io/en/2.1.1/installation/custom-packages/
+[12] https://osquery.readthedocs.io/en/stable/installation/install-osx/
+[16] https://github.com/facebook/osquery/issues/4703
+[17] https://github.com/facebook/osquery/issues/2815
+[19] https://www.uptycs.com/blog
diff --git a/data/privacy-report-2014.md b/data/privacy-report-2014.md
new file mode 100644
index 0000000..95e783c
--- /dev/null
+++ b/data/privacy-report-2014.md
@@ -0,0 +1,69 @@
+I read in a Norwegian news publication yesterday that more
+than 50% of Norwegians don't care about Internet and
+network surveillance [1]. In the original 60 page report
+(survey and report ordered by the Norwegian Data Protection
+Authority), named Privacy 2014 - The Current State and
+Trends ("Personvern 2014 - Tilstand og Trender"), 46% of the
+1501 participants state that they've gotten more concerned
+with privacy over the last 2-3 years.
+The follow up question that the survey presented was "How
+much do you care about privacy?". In the 1997 version of the
+survey 77% said they were "pretty engaged or very engaged"
+in privacy, while in 2013 there's an increase to 87%. Not as
+bad as the news publication wants it to be in other words. I
+guess what is referred to is mentioned in the section "The
+Chilling Effects in Norway", where more than half of the
+respondents state they haven't changed online behaviour
+after the revelations of the American surveillance
+methodologies. I think this correlates to the next section
+(below). Also, more than 45% state that they would have
+continued as normal if Norway were to start a massive
+surveillance campaign in collaboration with foreign
+I read one section where asked "how much control of your own
+situation do you feel you have?". More than half of the
+respondents answered themselves, and 33% the government. The
+latter is pretty amazing in my opinion. It's obviously
+yourself that is responsible for your own situation. Seen in
+regard to that more than 78% wouldn't pay 20 bucks a month
+for privacy in online services it's even better.
+The report also has its own section dedicated to the
+Snowden revelations. Pretty interesting that 53% responded
+that they didn't care about the surveillance, that it is
+unproblematic or that it's just plain
+necessary. Interesting, considering that it's another nation
+state than Norway we're talking about here. I could have
+understood it if it was our own government, but another
+country? Anyways, that's the facts.
+One question that I perhaps miss in the survey is "have you
+done anything to protect your online presence from
+surveillance?". One of the alternatives could for instance
+be: "I use end-to-end encryption, such as GPG". It was
+obviously not that technical a survey, and I can respect
+that - but at the same time I see that's where it has to
+end at some point. Thinking if I was employed in another
+type of occupation: I think people would have continued as
+normal if we get a mass-surveillance state because you get
+to a point of exhaustion due to the complexity of the
+technology and lack of knowledge on how to actually protect
+yourself. I also think that the hypothetical question of
+awareness of a mass-surveillance state would have had more
+chilling effects than people actually respond. The question
+actually reminds me of the Iron Curtain period, thinking
+that you are always surveilled.
+The survey can be read in full here [2] (Norwegian), and I
+think it's pretty good and thorough on the current state of
+privacy in Norway. The survey was delivered by Opinion
+Perduco. The 1997 survey was delivered by Statistics Norway.
+[1] http://translate.google.com/translate?sl=auto&tl=en&js=n&prev=_t&hl=en&ie=UTF-8&u=http%3A%2F%2Fwww.digi.no%2F926712%2Fhalvparten-gir-blaffen
+[2] https://www.datatilsynet.no/Nyheter/2014/Personvern-2014-tilstand-og-trender-/
diff --git a/data/relayd-multidomain.md b/data/relayd-multidomain.md
new file mode 100644
index 0000000..0cfcb02
--- /dev/null
+++ b/data/relayd-multidomain.md
@@ -0,0 +1,134 @@
+While running a relayd service for a multi-domain instance
+recently I quickly ran into an issue with relayd routing.
+relayd(8) is the relay daemon in OpenBSD.
+I run two local services that I front with relayd:
+* service A
+* service B
+These two I define in relayd.conf(5):
+ ext_addr=""
+ honk_port="31337"
+ inks_port="31338"
+ table { }
+ table { }
+To make sure relayd logs sufficiently for traceability I apply the
+following options:
+ log state changes
+ log connection
+The next part of my relayd.conf is creating a configuration for
+the relay service ("protocols are templates defining settings and rules for relays"):
+ http protocol https { }
+For the service definition I make sure to add the remote address
+and local address:
+ match request header append "X-Forwarded-For" value "$REMOTE_ADDR"
+ match request header append "X-Forwarded-By" \
+A further important logging configuration comes next, and I make
+sure my relay logs the host, X-Forwarded-For, User-Agent,
+Referer and url:
+ match header log "Host"
+ match header log "X-Forwarded-For"
+ match header log "User-Agent"
+ match header log "Referer"
+ match url log
+For performance [1]:
+ tcp { nodelay, sack, socket buffer 65536, backlog 100 }
+Next I disable vulnerable ciphers:
+ tls no tlsv1.0
+ tls no tlsv1.1
+ tls tlsv1.2
+Sadly tlsv1.3 is still in -current, so we will have to wait for
+I configure keys as follows:
+ tls ca cert "/etc/ssl/cert.pem"
+ tls keypair serviceA.domain
+ tls keypair serviceB.domain
+Finally we use the tables defined initially to route traffic to
+the right internal service:
+ match request header "Host" value "serviceA.domain" forward to
+ match request header "Host" value "serviceB.domain" forward to
+And that is it for the service definition.
+In addition we define the relay ("relays will forward traffic
+between a client and a target server") as follows. The "protocol
+https" is the junction between the two parts of the config.
+ relay https_relay {
+ listen on $ext_addr port https tls
+ protocol https
+ forward to port $honk_port check tcp
+ forward to port $inks_port check tcp
+ }
+The whole config:
+table { }
+table { }
+log state changes
+log connection
+http protocol https {
+ match request header append "X-Forwarded-For" value "$REMOTE_ADDR"
+ match request header append "X-Forwarded-By" \
+ match request header set "Connection" value "close"
+ match header log "Host"
+ match header log "X-Forwarded-For"
+ match header log "User-Agent"
+ match header log "Referer"
+ match url log
+ tcp { nodelay, socket buffer 65536, backlog 100 }
+ tls no tlsv1.0
+ tls no tlsv1.1
+ tls tlsv1.2
+ tls ca cert "/etc/ssl/cert.pem"
+ tls keypair cybsec.network
+ tls keypair inks.cybsec.network
+ match request header "Host" value "cybsec.network" forward to
+ match request header "Host" value "inks.cybsec.network" forward to
+relay https_relay {
+ listen on $ext_addr port https tls
+ protocol https
+ forward to port $honk_port check tcp
+ forward to port $inks_port check tcp
+[1] https://calomel.org/relayd.html
diff --git a/data/remote-forensics.md b/data/remote-forensics.md
new file mode 100644
index 0000000..62202a9
--- /dev/null
+++ b/data/remote-forensics.md
@@ -0,0 +1,159 @@
+Like everything else in information security, forensics is
+constantly evolving. One matter of special interest for
+practitioners is doing forensics on remote computers, not that
+it's entirely new.
+The use-case is self-explanatory to those working in the field,
+but for the beginners I'll give a brief introduction.
+When you get a case on your desk and it lights up as something
+interesting, what do you do? Probably your first step is searching
+for known malicious indicators in network logs. Finding something
+interesting on some of the clients, let's say ten in this case,
+you decide to put some more effort into explaining the nature of
+the activity. None of the clients is nearby, multiple of them are
+even on locations with 1Mbps upload speeds.
+The next phase would probably be a search in open sources, perhaps
+turning out in support of something fishy going on. Now you'd like
+to examine some of the client logs for known hashes and strings
+you found, and the traditional way to go is acquiring disk and
+memory images physically. Or is it? That would have easily taken
+weeks for ten clients. In this case you are lucky and you have a
+tool for performing remote forensics at hand. The tool was a major
+roll-out for your organization after a larger breach.
+What's new in remote forensics is that the tools begin to get more
+mature, and by that I would like to introduce two products of
+which I find most relevant to the purpose:
+* Google Rapid Response (GRR) [1]
+* Mandiant for Incident Response (MIR) [2]
+Actually I haven't put the latter option to the test (MIR supports
+OpenIOC which is an advantage) - but I have chosen to take GRR
+for a spin for some time now. There are also other tools which may
+be of interest to you such as Sourcefire FireAmp which I've heard
+performs well for end-point-protection. I've chosen to leave that
+out of this presentation since this is about a different
+concept. Surprisingly the following will use GRR as a basis.
+For this post there are two prerequisites for you to follow in
+which I highly recommend to get the feel with GRR:
+* Setup a GRR server [3]. In this post I've used the current beta
+ 3.0-2, running all services on the same machine, including the
+ web server and client roll-in interface. There is one install
+ script for the beloved Ubuntu here, but I couldn't get it easily
+ working on other systems. One exception is Debian which only
+ needed minor changes. If you have difficulties with the latter,
+ please give me a heads-up.
+* Sacrifice one client (it won't brick a production system as far
+ as I can tell either though) to be monitored. You will find
+ binaries after packing the clients in the GRR Server setup. See
+ the screenshot below for details. The client will automatically
+ report in to the server.
+You can find the binaries by browsing from the home screen in the
+GRR web GUI. Download and install the one of choice.
+A word of warning before you read the rest of this post: The GRR
+website ~~is~~ was a little messy and not entirely intuitive. I
+found, after a lot of searching, that the best way to go about it
+is reading the code usage examples in the web GUI, especially when
+it comes to what Google named flows. Flows are little plugins in
+GRR that may for instance help you task GRR to fetch a file on a
+specific path.
+Notice the call spec. This can be transferred directly to the
+iPython console. Before I started off I watched a couple of
+presentations that Google have delivered at LISA. I think you
+should too if you'd like to see where GRR is going and why it came
+to be. The one here gives a thorough introduction on how Google
+makes sure they are able to respond to breaches in their
+infrastructure [4].
+I would also like to recommend a presentation by Greg Castle at
+BlackHat for reference [5]. For usage and examples Marley Jaffe
+at Champlain College has put up a great paper [6]. Have a look at the
+exercises at the end of it.
+What is good with GRR is that it supports the most relevant
+platforms: Linux, Windows and OS X. These are also fully supported
+platforms at Google, so expect development to have a practical and
+long-term perspective.
+While GRR is relevant, it is also fully open source, and
+extensible. It's written in Python with all the niceness that
+comes with it. GRR has direct memory access by custom built
+drivers. You will find support for Volatility in there. Well they
+forked it into a new project named Rekall which is more suited for
+scale. Anyways it provides support for plugins such as Yara.
+If you are like me and got introduced to forensics through
+academia, you will like that GRR builds on Sleuthkit through pytsk
+for disk forensics (actually you may choose what layer you'd like
+to stay on). When you've retrieved an item, I just love that it
+gets placed in a virtual file system in GRR with complete
+The virtual filesystem where all the stuff you've retrieved or
+queried the client about is stored with versioning for your
+pleasure. In addition to having a way-to-go console application
+GRR provides a good web GUI which provides an intuitive way of
+browsing about everything you can do in the console. I think the
+console is where Google would like you to live though.
+And so I ended up in the grr_console, which is a purpose-built
+iPython shell, writing scripts for doing what I needed it to
+do. Remember that call spec that I mentioned initially, here is
+where it gets into play. Below you see an example using the
+GetFile call spec (notice that the pathspec in the flow statement
+says OS, this might as well have been ``REGISTRY`` or ``TSK``):
+ token = access_control.ACLToken(username="someone", reason="Why")
+ flows=[]
+ path="/home/someone/nohup.out"
+ for client in SearchClients('host:Webserver'):
+ id=client[0].client_id
+ o=flow.GRRFlow.StartFlow(client_id=str(id),
+ flow_name="GetFile", pathspec=rdfvalue.PathSpec(path=path, pathtype=rdfvalue.PathSpec.PathType.OS))
+ flows.append(o)
+ files=[]
+ while len(flows)>0:
+ for o in flows:
+ f=aff4.FACTORY.Open(o)
+ r = f.GetRunner()
+ if not r.IsRunning():
+ fd=aff4.FACTORY.Open(str(id)+"/fs/os%s"%path, token=token)
+ files.append(str(fd.Read(10000)))
+ flows.remove(o)
+If interested in Mandiant IR (MIR) and its concept, I'd like to
+recommend another Youtube video by Douglas Wilson, which is quite
+awesome as well [7].
+Update 2020: Today I wouldn't recommend MIR/FireEye HX, but rather
+something like LimaCharlie [8] due to the lack of hunting
+capabilities in the HX platform.
+[1] https://github.com/google/grr
+[2] http://www.fireeye.com/products-and-solutions/endpoint-forensics.html
+[3] https://grr-doc.readthedocs.io/en/latest/installing-grr-server/index.html
+[4] https://2459d6dc103cb5933875-c0245c5c937c5dedcca3f1764ecc9b2f.ssl.cf2.rackcdn.com/lisa13/castle.mp4
+[5] GRR: Find All The Badness - https://docs.google.com/file/d/0B1wsLqFoT7i2Z2pxM0wycS1lcjg/edit?pli=1
+[6] Jaffe, Marley. GRR Capstone Final Paper
+[7] NoVA Hackers Doug Wilson - Lessons Learned from using OpenIOC: https://www.youtube.com/watch?v=L-J5DDG_SQ8
+[8] https://www.limacharlie.io/
diff --git a/data/signals-feeds.md b/data/signals-feeds.md
new file mode 100644
index 0000000..531e29e
--- /dev/null
+++ b/data/signals-feeds.md
@@ -0,0 +1,219 @@
+## Key Takeaways
+* It is possible to index and tag a high number of RSS, OTX and
+ Twitter articles on limited computational power in seconds
+* Building logic around timestamps is complex
+* Structuring the resulting data in a graph is meaningful.
+## Introduction
+Today I am sharing some details about one of the multi-year
+projects I am running. The project motivation is:
+> To stay up to date on cyber security developments within days.
+I didn't want a realtime alerting service, but an analysis tool to
+gather important fragments of data over time. These fragments
+make up the basis of my open source research. The curated
+information usually ends up on a channel like an NNTP feed,
+sometimes with added comments.
+My solution was to create a common interface to ingest and search
+content from third party sources. Achieving this is difficult, and
+requires some work, but I found it feasible.
+Going through some basic research I found that much of what
+happens on the web eventually ends up on one of the following
+three places (e.g. a mention):
+1. OTX
+2. Twitter
+3. RSS
+After some work I found that there were two things important to me
+in the first iteration:
+1. Being able to recognize the characteristics of the content
+2. Knowing the publish time of the data
+The primary problem was thus to build a program that scales with a
+large number of feeds.
+Going from there I built a prototype in Python, which I've now
+matured into a more performant Golang version. What follows from
+here is my experience from that work.
+The tested component list of the program I am currently running is:
+* Gofeed [1]
+* Badger [2]
+* Apache Janusgraph [3,4]
+* Apache Cassandra [5]
+* Go-Twitter [6]
+* Alienvault OTX API [7]
+* Araddon Dateparse [8]
+[1] https://github.com/mmcdole/gofeed
+[2] https://github.com/dgraph-io/badger
+[3] https://janusgraph.org
+[4] https://docs.janusgraph.org/basics/gremlin/
+[5] https://cassandra.apache.org
+[6] https://github.com/dghubble/go-twitter/twitter
+[7] https://github.com/AlienVault-OTX/OTX-Go-SDK/src/otxapi
+[8] https://github.com/araddon/dateparse
+## The Lesson of Guestimation: Not All Feeds Are Created Equal
+Timestamps are perhaps some of the more challenging things to
+interpret in a crawler and search engine. RSS is a loose standard,
+at least when it comes to implementation. This means that
+timestamps may vary: localized, invalid per the RFC standards,
+ambiguous, missing and so on. Much like the web otherwise. Luckily
+without javascript.
+The goal is simply about recognizing which timestamp is the most
+correct one. A feed may contain one form of timestamp, while a
+website may indicate another one. To solve this I use and compare
+two levels of timestamping:
+* The feed published, updated and all items individual timestamps
+* The item and website last modified timestamps
+Looking back, solving the first level of timestamping was
+straight forward. These timestamps are present in the feed and for
+RSS the logic to build a list of timestamps would look like this:
+ /* First we check the timestamp of all
+ * feed items (including the primary).
+ * We then estimate what is the newest
+ * one */
+ var feedElectedTime time.Time
+ var ts = make(map[string]string)
+ ts["published"] = feed.Published
+ ts["updated"] = feed.Updated
+ var i=0
+ for _, item := range feed.Items {
+ ts[strconv.Itoa(i)] = item.Published
+ i++
+ ts[strconv.Itoa(i)] = item.Updated
+ i++
+ }
+ feedElectedTime, _, err = tsGuestimate(ts, link, false)
+The elected time can be used to compare with a previous feed
+checkpoint to avoid downloading all items again. Using the above
+logic I was also able to dramatically increase the success rate of
+the program, since it requires a valid timestamp. The
+`tsGuestimate` logic is something for a future post.
+Further the item/website timestamps requires a similar method, but in
+addition I found it an advantage to do a HTTP HEAD request to the
+destination URL to combine with the timestamps available from the
+feed. The central and important aspect here is to abort retrieval
+if an item already exists in the database, as this dramatically
+speeds up the processing in each run.
+False timestamps are a problem. I noticed that websites publish
+feeds with dynamic timestamps, which means that when you retrieve
+the feed it adds the timestamp of now. This obviously creates
+resource-intensive operations since the whole feed is then at risk
+for re-indexing each run.
+## Noise Reduction: Recognizing Content Characteristics
+Retrieving content is possible in several ways. For recognizing the
+content I opted for and have success/good coverage using
+regex. This is also some of the good things of curating articles,
+since this means experience with questions such as "why did I miss
+this article?" evolves into a new iteration of the program input.
+For instance, to stay on top of targeted cyber operations, I found
+that much used phrases in articles were "targeted attack" and
+"spear phishing". So based on that I deployed the following
+keyword search (regular expression) which applies to every new
+item ingested:
+ "targeted":"(?i)targeted\\satt|spear\\sp",
+So a new article containing "targeted attack" in the body or title
+is tagged with a hotword "targeted". Another hotword could be
+Perhaps not surprising this data can be modelled in a graph like
+ Tweet ─> URL in tweet ┌─> Targeted
+ └─> Breach
+## A Practical Example
+Traversing a news graph, we can go from the hotword "targeted", to
+all items and articles for the past days linked to the hotword.
+I use Gremlin for querying. An example is shown below (some
+details omitted):
+ keyw="targeted"
+ _date="2021-02-10"
+ g.V().hasLabel('hotword').has('title',keyw).as("origin_hw").
+ in().in().hasLabel('article:m').has('timestamp',gte(_date)).order().by('timestamp',asc).as('article').
+ .select("origin_hw","article").by(values('title','timestamp'))
+The procedure above summarized:
+1. Find the node with the keyword "targeted"
+2. Find all articles (for instance a tweet) that are two steps out
+ from the keyword (since these may be linked via a content node)
+3. Get title and timestamp from hotword and tweet
+Using a match, which was incidentally not a tweet but an article,
+from a RSS feed, we find the following:
+Retrieving the article with Gremlin, we can decide the source:
+ gremlin > g.V().has('title','WINDOWS KERNEL ZERO-DAY EXPLOIT (CVE-2021-1732) IS USED BY BITTER APT IN TARGETED ATTACK').valueMap()
+ =>{link=[https://www.reddit.com/r/netsec/.rss],
+ src=[Reddit - NetSec],
+ src_type=[rss],
+ sha256=[8a285ce1b6d157f83d9469c06b6accaa514c794042ae7243056292d4ea245daf],
+ added=[2021-02-12 10:42:16.640587 +0100 CET],
+ timestamp=[2021-02-10 20:31:06 +0000 +0000],
+ version=[1]}
+ ==>{link=[http://www.reddit.com/r/Malware/.rss],
+ src=[Reddit - Malware],
+ src_type=[rss],
+ sha256=[69737b754a7d9605d11aecff730ca3fc244c319f35174a7b37dd0d1846a823b7],
+ added=[2021-02-12 10:41:48.510538 +0100 CET],
+ timestamp=[2021-02-10 20:35:11 +0000 +0000],
+ version=[1]}
+In this instance the source was two Reddit posts which triggered
+the keyword in question and others about a targeted incident in
+China. Additionally this triggered a zero day hotword.
+## Summary
+Through this post I have shown some key parts of how to build a
+feed aggregator that can scale to thousands of feeds on a single
+computer, with update times in seconds.
+I have also given a brief view on how Janusgraph and similar
+systems can be used to model such data in a way which makes it
+possible to search, find and eventually stay up to date on
+relevant information to cyber security.
+When in place such a system may save hours per day since the data
+is normalised and searchable in one place.
diff --git a/data/ssh-ca-proxyjump.md b/data/ssh-ca-proxyjump.md
new file mode 100644
index 0000000..af24dcc
--- /dev/null
+++ b/data/ssh-ca-proxyjump.md
@@ -0,0 +1,228 @@
+## Key Takeaways
+* SSH has a key-signing concept that in combination with a
+ smartcard provides a lean, off-disk process
+* A SSH-CA provides the possibility of managing access
+ without a central point of failure
+* The use of SSH Jumphost is an easier way to tunnel
+ sessions end-to-end encrypted, while still maintaining
+ visibility and control through a central point
+## Introduction
+This post is an all-in-one capture of my recent discoveries with
+SSH. It is an introduction for a technical audience.
+It turns out that SSH is ready for a zero trust and
+microsegmentation approach, which is important for
+management of servers. Everything described in this post is
+available as open source software, but some parts require a
+smartcard or two, such as a Yubikey (or a Nitrokey if you
+prefer open source. I describe both).
+I also go into detail on how to configure the CA key without
+letting the key touch the computer, which is an important
+The end-result should be an architecture providing a better
+overview of the infrastructure and a second logon-factor
+independent of phones and OATH.
+## SSH-CA
+My exploration started when I read a 2016-article by
+Facebook engineering [1]. Surprised, but concerned with the
+configuration overhead and reliability I set out to test the
+SSH-CA concept. Two days later all my servers were on a new
+SSH-CA works predictably as follows:
+ [ User generates key on Yubikey ]
+ |
+ |
+ v
+ [ ssh-keygen generates CA key ] --------> [ signs pubkey of Yubikey ]
+ | - for a set of security zones
+ | - for users
+ | |
+ | |
+ | v
+ v pubkey cert is distributed to user
+ [ CA cert and zones pushed to servers ] - id_rsa-cert.pub
+ - auth_principals/root (root-everywhere)
+ - auth_principals/web (zone-web)
+The commands required in a nutshell:
+ # on client
+ $ ssh-keygen -t rsa
+ # on server
+ $ ssh-keygen -C CA -f ca
+ $ ssh-keygen -s ca -I -n zone-web -V +1w -z 1 id_ecdsa.pub
+ # on client
+ cp id_ecdsa-cert.pub ~/.ssh/
+Please refer to the next section for a best practice storage
+of your private key.
+On the SSH server, add the following to the SSHD config:
+ TrustedUserCAKeys /etc/ssh/ca.pub
+ AuthorizedPrincipalsFile /etc/ssh/auth_principals/%u
+What was conceptually new for me was principals and
+authorization files per server. This is how it works:
+1. Add a security zone, like zone-web, during certificate
+ signing - "ssh-keygen * -n zone-web *". Local username does
+ not matter
+2. Add a file per user on the SSH server, where zone-web
+ is added where applicable -
+ e.g. "/etc/ssh/auth_principals/some-user" contains "zone-web"
+3. Login with the same user as given in the zone file - "ssh some-user@server"
+This is the same as applying a role instead of a name to the
+authorization system, while something that IDs the user is
+added to certificate and logged when used.
+This leaves us with a way better authorization and
+authentication scheme than authorized_keys that everyone
+uses. Read on to get the details for generating the CA key
+## Keeping Private Keys Off-disk
+An important principle I have about private keys is to
+rather cross-sign and encrypt two keys than to store one on
+disk. This was challenged for the SSH-CA design. Luckily I found
+an article describing the details of PKCS11 with ssh-keygen
+> If you're using pkcs11 tokens to hold your ssh key, you
+> may need to run ssh-keygen -D $PKCS11_MODULE_PATH
+> ~/.ssh/id_rsa.pub so that you have a public key to
+> sign. If your CA private key is being held in a pkcs11
+> token, you can use the -D parameter, in this case the -s
+> parameter has to point to the public key of the CA.
+Yubikeys on macOS 11 (Big Sur) require the yubico-piv-tool
+to provide PKCS#11 drivers. It can be installed using
+ $ brew install yubico-piv-tool
+ $ PKCS11_MODULE_PATH=/usr/local/lib/libykcs11.dylib
+Similarly the procedure for Nitrokey is:
+ $ brew cask install opensc
+ $ PKCS11_MODULE_PATH=/usr/local/lib/opensc-pkcs11.so
+Generating a key on-card for Yubikey:
+ $ yubico-piv-tool -s 9a -a generate -o public.pem
+For the Nitrokey:
+ $ pkcs11-tool -l --login-type so --keypairgen --key-type RSA:2048
+Using the exported CA pubkey and the private key on-card a
+certificate may now be signed and distributed to the user.
+ $ ssh-keygen -D $PKCS11_MODULE_PATH -e > ca.pub
+ $ ssh-keygen -D $PKCS11_MODULE_PATH -s ca.pub -I example -n zone-web -V +1w -z 1 id_rsa.pub
+ Enter PIN for 'OpenPGP card (User PIN)':
+ Signed user key .ssh/id_rsa-cert.pub: id "example" serial 1 for zone-web valid from 2020-10-13T15:09:00 to 2020-10-20T15:10:40
+The same concept goes for a user smart-card, except that it is
+plug and play as long as you have the gpg-agent
+running. When the id_rsa-cert.pub (the signed certificate of
+e.g. a Yubikey) is located in ~/.ssh, SSH will find the
+corresponding private key automatically. The workflow will
+be something along these lines:
+ [ User smartcard ] -----------> [ CA smartcard ]
+ ^ id_rsa.pub |
+ | | signs
+ |------------------------------|
+ sends back id_rsa-cert.pub
+## A Simple Bastion Host Setup
+The other thing I wanted to mention was the -J option of
+ssh, ProxyJump.
+ProxyJump allows a user to confidentially, without risk of a
+man-in-the-middle (MitM), tunnel the session through a
+central bastion host end-to-end encrypted.
+Having end-to-end encryption for an SSH proxy may seem
+counter-intuitive since it cannot inspect the
+content. I believe it is the better option due to:
+* It is a usability compromise, but also a security
+ compromise in case the bastion host is compromised.
+* Network access and application authentication (and even
+ authorization) goes through a hardened point.
+* In addition the end-point should also log what happens on
+ the server to a central syslog server.
+* A bastion host should always be positioned in front of the
+ server segments, not on the infrastructure perimeter.
+A simple setup looks like the following:
+ [ client ] ---> [ bastion host ] ---> [ server ]
+Practically speaking a standalone command will look like
+ ssh -J jump.example.com dest.example.com
+An equivalent .ssh/config will look like:
+ Host j.example.com
+ HostName j.example.com
+ User sshjump
+ Port 22
+ Host dest.example.com
+ HostName dest.example.com
+ ProxyJump j.example.com
+ User some-user
+ Port 22
+With the above configuration the user can compress the
+ProxyJump SSH-command to "ssh dest.example.com".
+## Further Work
+The basic design shown above requires one factor which is
+probably not acceptable in larger companies: someone needs
+to manually sign and rotate certificates. There are some
+options mentioned in open sources, where it is normal to
+avoid having certificates on clients and having an
+authorization gateway with SSO. This does however introduce
+a weakness in the chain.
+I am also interested in using SSH certificates on iOS, but
+that has turned out to be unsupported in all apps I have
+tested so far. It is however on the roadmap of Termius,
+hopefully in the near-future. Follow updates on this subject
+on my Honk thread about it [4].
+For a smaller infrastructure like mine, I have found the
+manual approach to be sufficient so far.
+[1] Scalable and secure access with SSH: https://engineering.fb.com/security/scalable-and-secure-access-with-ssh/
+[2] Using a CA with SSH: https://www.lorier.net/docs/ssh-ca.html
+[3] Using PIV for SSH through PKCS #11:
+[4] https://cybsec.network/u/tommy/h/q1g4YC31q45CT4SPK4
diff --git a/data/ssh-certs-apple-t2.md b/data/ssh-certs-apple-t2.md
new file mode 100644
index 0000000..7784133
--- /dev/null
+++ b/data/ssh-certs-apple-t2.md
@@ -0,0 +1,94 @@
+## Key Takeaways
+* SSH certificates can be used with the Apple T2 chip on
+ macOS as an alternative to external smart cards,
+ authenticated with a fingerprint per session.
+* The Mac T2 chip serves as an extra security layer by creating
+ private keys in the secure enclave.
+* The CA can be stored on an external smartcard, only
+ signing for access in a limited period - again limiting
+ the exposure.
+## Introduction
+Over the past days I have been going down a deep, deep
+rabbit hole of SSH proxy jumping and SSH certificates
+combined with smart cards.
+After playing around with smart cards for SSH, I recognized
+that not only external smart cards such as the Yubikey or
+Nitrokey are a possible lane to go down.
+Mac computers come with a security chip called T2. This chip is
+also known to host something Apple calls Secure Enclave [1]. In
+the Secure Enclave you can store keys.
+It will probably not serve as an equally secure solution as with
+external smart cards, but it is a better balance for usability.
+The T2 is permanently stored in hardware on one host only,
+so the access needs to be signed on a per-host basis. In
+such I would say the T2 and external smart cards complement
+each other.
+Always having the key available will bring two additional
+* If compromised, the key is always available logically
+* Separation of equipment and key is not possible e.g. in a
+ travel situation
+With a central pubkey directory tied to an identity
+(automated), the T2 can be of better use for an enterprise
+## Setting up a Private Key in Secure Enclave
+While fiddling around I found sekey on Github [2]. The
+project seems abandoned, but it is the secure enclave that
+does the heavy lifting.
+The short and easy setup is:
+ $ brew cask install sekey
+ $ echo "export SSH_AUTH_SOCK=$HOME/.sekey/ssh-agent.ssh" >> ~/.zshrc
+ $ echo "IdentityAgent ~/.sekey/ssh-agent.ssh" >> ~/.ssh/config
+ $ source ~/.zshrc
+A keypair can now be generated in the secure enclave by:
+ $ sekey --generate-keypair SSH
+ $ sekey --list-keys
+Now export the public key of the curve generated on-chip:
+ $ sekey --export-key > id_ecdsa.pub
+Using the trick we found in our recent venture into using
+smart cards for signing the key, we can use PKCS#11 without
+compromising security [3]. In this case I use a Nitrokey:
+ $ brew cask install opensc
+ $ PKCS11_MODULE_PATH=/usr/local/lib/opensc-pkcs11.so
+ $ ssh-keygen -D $PKCS11_MODULE_PATH -e > ca.pub
+ $ ssh-keygen -D $PKCS11_MODULE_PATH -s ca.pub -I example -n zone-web -V +1h -z 1 id_ecdsa.pub
+ Enter PIN for 'OpenPGP card (User PIN)':
+ Signed user key id_ecdsa-cert.pub: id "example" serial 1 for zone-web valid from 2020-10-14T20:26:00 to 2020-10-14T21:27:51
+ cp id_ecdsa-cert.pub ~/.ssh/
+If you now try to ssh into a server using the given
+certificate authority as shown in the SSH-CA post [3],
+access should be granted with a fingerprint.
+## A Word of Caution
+The T2 has some vulnerabilities shown recently [4]. Make
+sure to include these in your risk assessment of using
+it. If you won't go down the smart card route it will still
+be better than storing the key on disk.
+[1] https://support.apple.com/guide/security/secure-enclave-overview-sec59b0b31ff/web
+[2] https://github.com/sekey/sekey
+[3] https://secdiary.com/2020-10-13-ssh-ca-proxyjump.html
+[4] https://inks.cybsec.network/tag/t2
diff --git a/data/telemetry.md b/data/telemetry.md
new file mode 100644
index 0000000..d781fdd
--- /dev/null
+++ b/data/telemetry.md
@@ -0,0 +1,250 @@
+Telemetry for cyber security is currently at a
+crossroads. While past methods have been efficient by being
+based on network monitoring, the current revolution in
+encryption and the distributed workspace makes it
+insufficient to solely rely on network monitoring. Through
+this post we are going to focus on the current challenges.
+> Telemetry is an electrical apparatus for measuring a
+> quantity (such as pressure, speed, or temperature) and
+> transmitting the result especially by radio to a distant
+> station
+> – Merriam-Webster
+Telemetry, a term mostly used by AV-vendors, has become
+broadly applied as services change from centralised to
+decentralised and geographically spread. Yesterday an employee
+would work at his desk from 9-5 and then go home, while
+today's modern worker moves around the office area and can
+basically work from anywhere in the world when they feel
+like it.
+In cyber security, telemetry can generally be categorised
+in: 1) Network-centric and 2) endpoint-based. A complete
+telemetry profile is essential for being able to monitor
+security events and to execute retrospective
+analysis. Through my recent article on indicators [1] I
+proposed a structure for indicators organised in three
+levels of abstraction. In this article a telemetry profile
+means something that covers a degree of these three levels.
+ | Level of abstraction | | Formats
+ |-----------------------|----|-------------
+ | Behavior | | MITRE (PRE-)ATT&CK
+ |-----------------------|--->|-------------
+ | Derived | | Suricata+Lua, Yara
+ |-----------------------|--->|-------------
+ | Atomic | | OpenIOC 1.1
+## The Challenges
+There are generally two problems that need to be fully
+solved when collecting data for cyber security:
+* The use of encryption from end-to-end
+* Workers and thereby the defended environment are or will be distributed
+As of February 2017 the web was 50% encrypted [2]. Today
+that number [3] is growing close to 70%.
+For defense purposes, it is possible to identify malicious
+traffic, such as beaconing, through metadata analysis. There
+have been some developments on detecting anomalies in
+encrypted content lately - namely the fingerprinting of
+programs using SSL/TLS. In the future I believe this will be
+the primary role of network-based detection. This is
+actually a flashback to a pre-2010 monitoring environment
+when full content was rarely stored and inspected by
+security teams.
+An additional element to consider is the previous debate
+about public key pinning, which has now evolved into
+Expect-CT [4]. This means that man in the middle (MitM)
+techniques are going to be a no-no at some point. Yes, that
+includes your corporate proxy as well.
+There is one drawback and dealbreaker with the above for
+security teams: it requires access to the datastream used by
+the endpoints to be fully effective.
+VPNs are going away as more resilient and modern network
+architectures become dominant. The most promising
+challenger at the moment is the Beyondcorp [5] (based on
+zero trust) architecture proposed by Google more than six
+years ago. A zero trust architecture means that clients will
+only check in to the corporate environment at the points
+that _they_ need or are in the vicinity of corporate
+resources. Other activity, such as browsing on external
+websites, is actually no longer going via the corporate
+infrastructure or its monitored links. Additionally, the
+endpoint is easily the most common infiltration vector.
+To be honest, the Beyondcorp model reflects to a larger
+extent how humans actually interact with computers. Humans
+have never been confined to the perimeter of the enterprise
+network. This may be some of the reason for organisations
+being in a currently defeatable state as well. The only ones
+to confine themselves to the enterprise network are
+ironically the network defenders.
+> The only ones to confine themselves to the enterprise network are
+> ironically the network defenders.
+The battle of controlling the technology evolution is not
+completely lost though, it is a matter of changing the
+mindset of where data or telemetry is collected. Yesterday
+it was at the corporate proxy or in the corporate
+environment - today it is on the endpoint and during the
+connections to valuable resources.
+For endpoints, the primary challenges currently faced are:
+* Maintaining the integrity of locally stored and buffered data
+* The availability and transport of data to a centralised logging instance
+* Confidentiality of the data in transport or at rest
+* Data source consistency for central correlation of information from several
+ host sources
+* Raising the stakes on operational security in a cat and mouse
+ chase between intruders and defenders
+Remote logging is a subject that has gained much publicity
+previously, so we are not going into depth about that here.
+### Existing Tooling For Endpoints
+This section was not originally a part of the scope of this
+article, but I'd like to establish a baseline of parts of
+the available tooling to handle the above issues. I also
+believe it touches some of the endpoint challenges.
+For the purpose of this article, we define the following
+well-known computer abstraction stack:
+1. Hardware
+2. Operating System
+3. Application
+Hardware verification and logging is currently a more or
+less unexplored field, with primarily only one tool
+available to my knowledge. That tool is Chipsec [6] which has
+been of interest and integrated into the Google Rapid
+Response (GRR) [7] project for some time.
+Operating system logs are well understood today, and many
+organisations manage logging from the host operating system.
+There are increasingly good event streaming and agent-based
+systems available, such as LimaCharlie [8], Sysmon [9] and
+Carbon Black [10]. The media focus of these platforms are on
+the more trendy term "hunting", but their real purpose is
+OS-level logging and pattern matching.
+Further, distributed forensic platforms are available from
+FireEye (HX) and an open source equivalent from Google named
+GRR. GRR have been featured extensively on this site
+previously. Common for these are that they do not stream
+events, but rather stores information on the endpoint.
+Application layer logging is extremely challenging. The
+logging mechanism in this regard needs to be connected to
+the structure of the application itself, and there are a lot
+of applications. Further, many application developers do
+not focus on logging.
+Application logging is important and could be seen as the
+technical contextual information provided by the
+endpoint. Exposed applications that are important in terms
+of coverage:
+* Browsers
+* Email Readers
+* Application Firewalls (if you have one)
+* Instant Messaging Clients
+* Rich Document editors, such as Excel, Word, Powerpoint
+These applications are important since they are the first
+point of contact for almost any technical threat. Done
+right, application logs will be at a central location before
+the intruder manages to get a foothold on the client. Thus,
+the risk of data being misrepresented in the central system
+are highly reduced (integrity).
+Taking browsers and Microsoft Office as an example, there
+are some options readily available:
+* Firefox HTTP and DNS logging: mozilla.org [11]
+* Office Telemetry logging: Office Telemetry Log [12]
+The above examples are not security focused as far as I
+could tell, more often they are debug oriented. However, the
+same data is often what we are after as well (such as: did
+the document have a macro? or what is the HTTP header?).
+The dependency on the application developers to create
+logging mechanisms is quite a challenge in this
+arena. However, I believe the solution in cases where
+applications do not log sufficiently is to take advantage
+of plugins. Most modern applications support plugins to
+some extent.
+To summarise the tooling discussion, we can populate the
+computer abstraction layers with the mentioned tools.
+ | Level of abstraction | | Tools
+ |-----------------------|----|-------------
+ | Application | | Browser, Email and so on
+ |-----------------------|--->|-------------
+ | Operating System | | LC, CB, Sysmon,
+ |-----------------------|--->|-------------
+ | Hardware | | Chipsec
+## Conclusions: How Do We Defend in The Future?
+In this article we have defined a structure and discussed in
+short one of the most prominent challenges faced by
+enterprise defenders today: how do we defend in the future?
+Technology. This is the point where technology alone is no
+longer the sole solution to defending a network. Modern
+network architectures mean that defenders need to be able
+to fully comprehend and use the human nature as sensors. It
+is also about building intuitive systems which make the
+necessary data and information available to the
+defenders. In my mind technology has never been the sole
+solution either, so the technology evolution is for the
+greater good.
+It seems obvious and unavoidable to me that network
+defenders must start looking outside the perimeter, just as
+intruders have done for many years already. This means
+adapting the toolsets available and lobbying for an
+architecture that reflects how humans actually use
+technology resources. Most people have owned private
+equipment for many years (surprise), and the line between
+employee and enterprise is blurred and confusing when
+reality now sinks in.
+This means, in the technology aspect, that an emphasis must
+be put on the endpoints - and that network monitoring must
+again be about the metadata of the activity. In short:
+collect metadata from networks and content from endpoints.
+Only this way will we, in the future, be able to create a
+full telemetry profile from each device under our
+[1] Article on indicators: /indicators/
+[2] 50% encrypted: https://www.eff.org/deeplinks/2017/02/were-halfway-encrypting-entire-web
+[3] that number: https://letsencrypt.org/stats/
+[4] Expect-CT: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Expect-CT
+[5] Beyondcorp: https://cloud.google.com/beyondcorp/
+[6] Chipsec: https://github.com/chipsec/chipsec
+[7] Google Rapid Response (GRR): https://github.com/google/grr-doc/blob/master/publications.adoc
+[8] LimaCharlie: https://github.com/refractionPOINT/lce_doc/blob/master/README.md
+[9] Sysmon: https://www.rsaconference.com/writable/presentations/file_upload/hta-w05-tracking_hackers_on_your_network_with_sysinternals_sysmon.pdf
+[10] Carbon Black: http://the.report/assets/Advanced-Threat-Hunting-with-Carbon-Black.pdf
+[11] mozilla.org: https://developer.mozilla.org/en-US/docs/Mozilla/Debugging/HTTP_logging
+[12] Office Telemetry Log: https://msdn.microsoft.com/en-us/library/office/jj230106.aspx
diff --git a/data/travel.md b/data/travel.md
new file mode 100644
index 0000000..2724f40
--- /dev/null
+++ b/data/travel.md
@@ -0,0 +1,361 @@
+Travelling with electronic devices is a challenge, and this is
+certainly the case if you do not have a travel program for your
+employees, where you must tinker with a new setup on a case by
+case basis. The complexity of the matter is tough, even when it
+comes to resources, as it requires full time attention.
+Some organisations choose to ignore the problem altogether,
+others again do not fully respect their own threat model. The
+latter may be just as dangerous, as it may lead to a false sense
+of security for the travellers.
+This article is about establishing a technical laptop setup that
+can be re-used with ease. Thus, other operational and strategic
+aspects are left out. The information presented evolves around
+organisations, but might as well apply for a private travel of
+exposed individuals.
+## Main Drivers
+With that out of the way: multiple overall factors are left for
+consideration. The following factors are the main drivers and
+equally important when developing a technical model of an abroad
+* Threat resiliency. Equipment on travel can really never be
+ secured well enough, but it can be hardened to the degree that a
+ threat actor needs to risk exposure to compromise it
+* Usability for the traveller. Equipment that feels inconvenient
+ will be avoided by the traveller at some point
+* Usability for the supporting organisation (both security and IT
+ operations). Such setups may require much time and attention to
+ develop and if there are an increasing number of travellers to
+ high risk areas the setup needs to scale
+* Cost. A travel program is a balance between environment,
+ security and cost. If the cost and environmental impact
+ surpasses the value that needs to be secured, the travel program
+ misses some of its value. Critical infrastructure organisations
+ is a different ball game than other industries on this point.
+When it comes to threats, the most prominent one is the evil maid
+infiltration vector - which is basically someone gaining physical
+access to a computer. Motherboard recently published an article
+on how a malicious party could add a backdoor to a Dell (example
+used) laptop in less than 5 minutes [1].
+Other examples of relevant techniques used against travellers are:
+electronic eavesdropping using cell networks, physical monitoring
+of hotel rooms (e.g. camera surveillance), malicious charging
+stations and so on. More details on general infiltration
+techniques can be found in the Mitre ATT&CK's "Initial Access"
+category (each described on their Wiki [2,3]).
+## Conceptual Overview
+Now that we have reviewed the main drivers, the question is if you
+can protect against the given threat model in an easily achievable
+way. To assess that we will first have to take a look at a conceptual
+model for travel. Taking a top-down approach, the travel setup
+will in most cases consist of two components:
+1. The devices used for travel
+2. The server side infrastructure
+There are arguments for a standalone operation, but the legal
+ramification and practical impact of sending an employee into a
+hostile environment with anything but local encryption is risky at
+best. To note: that is, if the user will actually produce or carry
+anything of value. If not, a standalone setup may in some cases be
+argued for.
+Tactical no-brainers when travelling are the following:
+1. The system should disclose as little as possible about the
+ traveller's pattern of activity and content
+2. As little information as possible should be at rest on devices
+ at risk
+3. It should come at a high cost to compromise the end-point both
+ for physical and technical exploitation
+4. The equipment should never be connected to an organisation's
+ service infrastructure directly before, during or after travel
+5. The system should not be obviously provocative to locals -
+ e.g. during airport inspections.
+As far as I have found, there is currently one desktop system
+that sufficiently meets these criteria - and that is ChromeOS
+which comes with sane default settings, has a really minimal
+configuration and is usable to an average person. However,
+ChromeOS is not a mobile operating system - and for that purpose
+iOS and Android is a better fit even though they do not tick off
+all the above boxes.
+With that in mind the following model, that I have named "The
+Tactical Travel Protection Model", provides a hardened, basic
+infrastructure setup that uses cloud providers to hide in plain
+![The Tactical Travel Protection Model shows the concept of a full stack travel
+The model is further detailed in the following section.
+## Scalability and Technical Implementation
+With the conceptual model shown in the last section, it is time to
+dive into implementation in a practical situation. The beauty of
+the model is its modularity, so a component - such as a cloud
+server, can easily be put in a local and physically controlled
+location. Thus, please consider the technologies mentioned as an
+example - the power of the model comes to play when you start
+switching things up.
+### Server Side Components
+Consider the availability of external services in all parts of the
+process. Ideally a travel device should store information only
+outside the regional location of a traveller. Balance storage
+with requirements of availability. An example of such is that an
+enforced VPN connection may not always be available, which would
+practically leave an SFTP link exposed or down.
+For the example technologies used in the model shown in the
+previous section, the following sections show the use.
+#### Cloud Policy, Provisioning, Device and User Management
+The reason we really need to use a device management service is
+the scalability of deployment. Using a standalone approach may
+work and provide some additional security due to the independence
+of each device, but it is inevitable in the long run if you handle
+even a low amount of travels.
+In this case, especially due to using ChromeOS, G Suite is the
+most straightforward choice. It is important to focus the solution
+on managing devices when speaking of travels, not pushing
+sensitive configuration files and so on. If encountering a
+compromise of the G Suite administrative account - it is possible
+to push threat actor-controlled applications and configurations to
+devices. Due to this it is essential to clean out the management
+domain or create a new, untraceable one once in a while.
+G Suite is a granular solution. Examples of recommended policies
+are: enforced use of security tokens and the disabling of other
+two factor authentication options, screen lock upon lid close and
+so on.
+When testing G Suite and ChromeOS I figured that it is easiest to
+provision VPN configuration files (``.onc``) and certificates
+manually. For iOS the same goes with ``.mobileconfig``. Doing this
+adds another protective layer.
+#### VPN
+For VPN, my experience is that the most reliable option is using
+native supported VPN clients in the operating system used for
+travel. In this case it is ChromeOS with OpenVPN and iOS with
+IPSec. This adds a bit to the complexity as iOS does not support
+OpenVPN which runs most reliably in some countries that censors
+the Internet. However, ChromeOS does. The solution to this is
+using two VPS nodes for tunneling traffic:
+1. OpenVPN service through ansible-openvpn-hardened [4]
+2. IPSec service through [5]. Lenny Zeltser created a
+ deployment-guide on algo recently [6]
+Again: to reduce exposure through centrality, you should not
+provision device-specific keys from central management
+consoles. Also, make sure to use certificates by any service that
+needs to connect to the Internet.
+Configure according to the README on the
+``ansible-openvpn-hardened`` Github page. When you deploy the
+OpenVPN server, you will be left with a file named something like
+``@.preregistration-pki-embedded.ovpn`` in
+the ``fetched_credentials/`` directory. Just like Apple
+has its ``mobileconfig`` format, the Chromium Project uses the
+Open Network Configuration (ONC) [7]. In order to convert this
+format to a working configuration file, use ovpn2onc.py [8] like
+the following.
+ python3 reference/convert.py --infile *-pki-embedded.ovpn --outfile vpn_configuration.onc --name my_vpn
+This results in a configuration file named
+``vpn_configuration.onc``. ChromeOS will not give you any feedback
+here, so make sure to read through everything to get it right the
+first time. If you end up troubleshooting, I found that the
+Chromium project does have some working examples [9]. Import
+``vpn_configuration.onc`` in Chrome as shown in the next section.
+Due to the hardened setup, be particularly strict to configure
+with an OS version according to the repo README. For instance
+Debian 8.10 won't work.
+**Algo**: Has great docs as-is.
+#### SFTP
+An SFTP service is simple to manually deploy. However, when
+scalability and hardening matter it is best to automate the
+deployment. Through testing available Ansible scripts I ended up
+with Johan Meiring's ansible-sftp [10]. Again, the configuration
+is self-explanatory. You should however note that public
+keys should be put in a ``files/`` directory under
+``ansible-sftp`` root. These can be generated with
+``ssh-keygen``, the private keys needs to be stored somewhere else
+for manual transfer to the laptop accessing it.
+Since this is a traveller setup you should seek to create a
+disconnect between cloud drives and rather use local storage and
+SFTP. Disable OneDrive in Office 365 Business and Google Drive in
+G Suite.
+#### Deploying an Out-of-Band (OOB) Channel
+Communications is king and perhaps one of the most important
+things you configure.
+I described using Matrix and Riot for OOB recently [11].
+#### Security Keys
+Nowadays, strong authentication is so easy that everyone should
+use it. In a hostile environment it is hygiene. Google uses
+Yubikeys and Feitian tokens in their authentication services and
+so should a traveller [12,13,14]. This eliminates some of the
+uncertainty when authenticating against remote servers and is
+something the traveller can keep on-body at all times. For this
+setup not every service can maintain usability when using
+tokens. Those services - such as a mounted SFTP share should use
+### Client Side Components
+So why a Chromebook?
+* Has a minimal configuration. Everything you do is in the
+ browser
+* You get granular control through G Suite
+* Based on the Linux-kernel, which means it is different from
+ Windows and may require some extra effort from a threat actor
+* A lot of work has gone in to the user interface in ChromeOS, so
+ it will feel familiar and intuitive to users
+* ChromeOS has a lot of security features built-in [15], such as:
+ Secure Boot, Security Key login and so on.
+G Suite will help you a little bit on the way when it comes to
+configuration control. However, it requires some client-side
+The client side consists of components. I chose to model these as
+five layers:
+The Traveller. The most important asset on the travel is most
+likely your human traveller. This asset will have some values
+assigned to it, such as security keys, credentials and his own
+knowledge. Anonymise information stored here. In other words,
+make sure to use an identifier and not the travellers real name.
+Device and information. When selecting devices and putting
+information on it you have entered the device and information
+exposure layer. This will typically consist of all hardware
+peripherals, such as cameras, and content such as calls made from
+a handset. Other things to consider here for ChromeOS is deploying
+PGP and its keys with Mailvelope and Office from Google Play
+Content. It was actually kind of interesting to model this from an
+iOS and ChromeOS perspective, because ChromeOS keeps most of its
+applications in the browser while iOS has native apps on line with
+Chrome. This again means that the exposure surface of ChromeOS is
+more uniform than on iOS.
+Native applications. This is the actual applications installed in
+the operating system directly. For iOS this has larger exposure
+with native applications for e.g. communications, while on
+ChromeOS you will basically only install an SFTP plugin to the
+file system and use Chrome for a travel.
+Transport. When travelling to a hostile environment, tunnel all
+communications to and from the system as far as possible. Both iOS
+and ChromeOS has sufficient mechanisms here as we reviewed in the
+previous section. For encryption keys:
+1. Transfer encryption keys stored in the ``.p12`` file and the
+ configuration to the Chromebook
+2. Install encryption keys in
+ ``chrome://settings/certificates``. Use the "Import and Bind"
+ option to install the certificate to TPM
+3. Import the VPN configuration (ONC) in
+ ``chrome://net-internals/#chromeos``
+That is basically it.
+## Conclusion
+The art of balancing threat resiliency, usability and cost is an
+intriguing problem.
+The technology out there, presented in this article, is in no way
+designed to survive in hostile environments when considering the
+capabilities of nation state threat actors. Fundamental security
+mechanisms are lacking in this regard, and only companies like
+Microsoft, Google and Apple can provide the basis to change
+those. We can however slow these actors down considerably.
+An important aspect to consider, in order to compensate for the
+above weaknesses, is that organisations needs to handle these
+problems on an operational and strategic level as well.
+Using cloud environments are a solid choice for travel. However,
+when considering threat actors that are able to gain access to the
+hosts of those environments they are not sufficient. To solve
+this, the most valuable services may be moved in-house or to a
+hardened cloud environment. End-to-end encryption is also required
+when using cloud services, such as when using the included inbox
+of G Suite.
+Please keep in mind that The Tactical Travel Protection Model is
+a core model. This article does not cover every aspect. An
+example of such is encryption and protection of external
+peripherals and memory devices and operational and strategic
+Organisations have yet to prove a working model resilient to
+capable adversaries. Hopefully this article will be a foundation
+to discuss variations and weaknesses in the community.
+[1] https://motherboard.vice.com/en_us/article/a3q374/hacker-bios-firmware-backdoor-evil-maid-attack-laptop-5-minutes
+[2] https://mitre.github.io/attack-navigator/enterprise/
+[3] https://attack.mitre.org/wiki/Initial_Access
+[4] https://github.com/bau-sec/ansible-openvpn-hardened
+[5] https://github.com/trailofbits/algo
+[6] https://zeltser.com/deploy-algo-vpn-digital-ocean/
+[7] https://www.chromium.org/chromium-os/chromiumos-design-docs/open-network-configuration
+[8] https://gist.github.com/tommyskg/6d0eeecc5bab65a49d72f5b16e086976
+[9] https://chromium.googlesource.com/chromium/src/+/32352ad08ee673a4d43e8593ce988b224f6482d3/chromeos/test/data/network
+[10] https://github.com/johanmeiring/ansible-sftp
+[11] https://secdiary.com/2018-07-11-matrix.html
+[12] https://krebsonsecurity.com/2018/07/google-security-keys-neutralized-employee-phishing/
+[13] https://www.yubico.com/product/yubikey-4-series/#yubikey-4c
+[14] https://ftsafe.com/onlinestore/product?id=3
+[15] http://dhanus.mit.edu/docs/ChromeOSSecurity.pdf
diff --git a/data/vantage.md b/data/vantage.md
new file mode 100644
index 0000000..c6cf8ce
--- /dev/null
+++ b/data/vantage.md
@@ -0,0 +1,222 @@
+## Key Takeaways
+* Monitoring the technology infrastructure is a key element for
+ situational awareness in both security and IT operations.
+* A 2020 infrastructure should use a modern application layer
+ reverse proxy such as Pomerium in front of all services. Leave
+ all clients outside.
+* The threat landscape should be the focus when shaping a
+ defendable infrastructure.
+Disclaimer: If you have outsourced all your equipment
+and information to "the cloud", this post is a sanity check of the
+relationship with your vendor. The primary audience of this post
+is everyone willing to invest in people and knowledge to provide a
+best possible defense for their people and processes, and the
+technology supporting them.
+## Introduction
+I cannot start to imagine how many times Sun Tzu must have been
+quoted in board rooms around the world:
+> If you know the enemy and know yourself, you need not fear the
+> result of a hundred battles. If you know yourself but not the
+> enemy, for every victory gained you will also suffer a
+> defeat. If you know neither the enemy nor yourself, you will
+> succumb in every battle.
+However much repeated, the message has not come across. Why is
+that? Because this is a hard problem to solve. It is in the
+intersection between people as a culture and technology.
+If all used reverse proxies in a sensible way I would probably
+have a lot less to do at work. Time and time again it turns out
+that organisations do not have configuration control over their
+applications and infrastructure, and the reverse proxy is a
+central building block in gaining it. To an extent everything is
+about logs and traceability when an incident occurs.
+## Beyondcorp and The Defendable Infrastructure
+The lucky part of this hard-to-solve problem is that Google has
+already prescribed one good solution in its Beyondcorp whitepapers
+But this was in some ways described in the Norwegian Armed Forces
+before that in its five architecture principles for a defendable
+infrastructure. These were published by its former Head of Section
+Critical Infrastructure Protection Centre [2]:
+1. Monitor the network for situational awareness
+2. A defender must be able to shape the battleground to have
+ freedom of movement and to limit the opponent's freedom of
+ movement
+3. Update services to limit vulnerability exposure
+4. Minimize the infrastructure to limit the attack
+ surface
+5. Traceability is important to analyze what happened
+I know that Richard Bejtlich was an inspiration for the defendable
+infrastructure principles, so the books written by him is relevant
+Defendable infrastructure is a good term, and also used in a 2019
+Lockheed article which defines it well [3]:
+> Classical security engineering and architecture has been trying
+> to solve the wrong problem. It is not sufficient to try to build
+> hardened systems; instead we must build systems that are
+> defendable. A system’s requirements, design, or test results can’t
+> be declared as "secure." Rather, it is a combination of how the
+> system is designed, built, operated, and defended that ultimately
+> protects the system and its assets over time. Because adversaries
+> adapt their own techniques based on changing objectives and
+> opportunities, systems and enterprises must be actively defended.
+The development of these architecture principles happened before
+2010, so the question remains how they apply in 2020. We may get
+back to the other principles in later posts, but the rest of this
+article will focus on monitoring in a 2020-perspective.
+## Monitoring - a Central Vantage Point
+One thing that has developed since 2010 is our understanding of
+positioning monitoring capabilities and the more mainstream
+possibility of detection on endpoints. The historical focus of
+mature teams was primarily on the network layer. While the network
+layer is still important as an objective point of observation the
+application layer has received more attention. The reason for it
+is the acceptance that often it is where exploitation happens and
+the capabilities as commercial products have emerged.
+With that in mind a shift in the understanding of a best practice
+of positioning reverse proxies has occurred as well. While the
+previous recommendation was to think: defend inside-out. The focus
+is now to defend outside-in.
+The meaning of defending outside-in, is to take control of what
+can be controlled: the application infrastructure. In all
+practicality this means to position the reverse proxy in front of
+your server segment instead of the whole network, including
+ [ Application A ]
+ [ Client on-prem ] |
+ ] ---> [ Reverse proxy ] ---> [ App gateway ]
+ [ Client abroad ] ^ |
+ risk assessment [ Application B ]
+Previously, by some reason, we put the "client on-prem" on the
+other side of the reverse proxy, because we believed we could
+control what the user was doing. Today, we know better. This is
+not a trust issue, it is a matter of prioritizing based on the
+asset value and the defending capacity.
+A reverse proxy is also a central vantage point of your
+infrastructure. In a nutshell if you are good detecting security
+incidents at this point, you are in a good position to have
+freedom of movement - such as channeling your opponent.
+The modern reverse proxy has two integration capabilities that
+legacy proxies do not:
+* Single sign-on (SSO), which provides strong authentication and
+ good identity management
+* Access control logic (Google calls this the access control
+ engine)
+In fact, Google in 2013 stated it uses 120 variables for a risk
+assessment in its access control logic for Gmail [6]. In
+comparison most organisations today use three: username, password
+and in half the instances a token.
+> Every time you sign in to Google, whether via your web browser
+> once a month or an email program that checks for new mail every
+> five minutes, our system performs a complex risk analysis to
+> determine how likely it is that the sign-in really comes from
+> you. In fact, there are more than 120 variables that can factor
+> into how a decision is made.
+I imagine that Google uses the following factors for comparison to
+the sole username/password approach (they state some of these in
+their article):
+- Geo-location with an algorithmic score of destination of last
+ login to current location was part of this. The k-means distance
+ could be a good fit.
+- Source ASN risk score
+- Asset subject to access
+- User role scored against asset subject to access
+- Device state (updated, antivirus installed and so on)
+- Previous usage patterns, like time of day
+- Other information about the behavioural patterns of relevant threats
+Another nice feature of a reverse proxy setup this way is that it
+minimizes the exposure and gives defenders the possibility to
+route traffic the way they see fit. For instance, it would be hard
+for an attacker to differentiate between a honeypot and a
+production system in the first place. One could also challenge the
+user in cases where in doubt, instead of plainly denying access as
+is sometimes done.
+One challenge is what protocols need support. The two clear ones
+* SSH
+* Application gateways between micro-segments
+I have scoped out the details of micro-segmentation from this
+post. Micro-segmentation is the basic idea of creating a fine mesh
+of network segments in the infrastructure so that no asset can
+communicate with another by default. The rest is then routed
+through e.g. a gateway such as Pomerium, or in high-performance
+cases an application gateway - which may be a gateway for a
+specific binary protocol. The reason is control of all activity
+between services, being able to shape and deny access in the
+Even though this post is not about implementation I will leave you
+with some examples of good open source starting points: Pomerium
+is a reverse proxy with the SSO-capability, and the default
+capabilities of SSH takes you far (ssh-ca and JumpHost).
+ -----------> [ syslog server ] <------------
+ | | |
+ | | |
+ o | | |
+ /|\ [ Client ] -------> [ example.com ] <-----> [ app001.example.com ]
+ / \ | https - pomerium |
+ | | - SSH JumpHost |
+ | | |
+ | | |
+ [ HIDS ] |-------------------> [ NIDS ]
+ Figure 1: Conceptual Defendable Infrastructure Overview
+Now that a checkpoint is established in front of the infrastructure,
+the rest is a matter of traceability, taking the time to
+understand the data to gain insight and finally develop and
+implement tactics against your opponents.
+Until next time.
+[1] https://cloud.google.com/beyondcorp
+[4] Tao of Network Security Monitoring, The: Beyond Intrusion
+[5] Extrusion Detection: Security Monitoring for Internal
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..6c636ad
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,77 @@
+ "nodes": {
+ "cl-nix-lite": {
+ "locked": {
+ "lastModified": 1721009305,
+ "narHash": "sha256-GtVd8VmPZB+J64VCf26yLbFUFRT1mdpzC8ylAHMIJoo=",
+ "owner": "hraban",
+ "repo": "cl-nix-lite",
+ "rev": "dc2793ec716b294739dabd6d99cc61543e6cd149",
+ "type": "github"
+ },
+ "original": {
+ "owner": "hraban",
+ "repo": "cl-nix-lite",
+ "type": "github"
+ }
+ },
+ "flake-utils": {
+ "inputs": {
+ "systems": "systems"
+ },
+ "locked": {
+ "lastModified": 1710146030,
+ "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1722791413,
+ "narHash": "sha256-rCTrlCWvHzMCNcKxPE3Z/mMK2gDZ+BvvpEVyRM4tKmU=",
+ "owner": "NixOS",
+ "repo": "nixpkgs",
+ "rev": "8b5b6723aca5a51edf075936439d9cd3947b7b2c",
+ "type": "github"
+ },
+ "original": {
+ "owner": "NixOS",
+ "ref": "nixos-24.05",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "cl-nix-lite": "cl-nix-lite",
+ "flake-utils": "flake-utils",
+ "nixpkgs": "nixpkgs"
+ }
+ },
+ "systems": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
diff --git a/flake.nix b/flake.nix
index ebe348c..0a099a0 100644
--- a/flake.nix
+++ b/flake.nix
@@ -13,17 +13,57 @@
pkgs = nixpkgs.legacyPackages.${system}.extend cl-nix-lite.overlays.default;
- defaultPackage.x86_64-linux =
- # Notice the reference to nixpkgs here.
- with import nixpkgs { system = "x86_64-linux"; };
- stdenv.mkDerivation {
- name = "hello";
- src = self;
- buildPhase = "gcc -o hello ./hello.c";
- installPhase = "mkdir -p $out/bin; install -t $out/bin hello";
+ packages = {
+ ecl = with pkgs.lispPackagesLiteFor pkgs.ecl; lispDerivation {
+ name = "thoughts";
+ lispSystem = "thoughts";
+ lispDependencies = [
+ asdf
+ arrow-macros
+ ];
+ src = pkgs.lib.cleanSource ./generator.lisp;
+ meta = {
+ license = pkgs.lib.licenses.agpl3Only;
+ buildInputs = [
+ pkgs.ecl
+ pkgs.git
+ pkgs.gnumake
+ pkgs.asdf
+ pkgs.multimarkdown
+ ];
+ phases = [ "unpackPhase" "installPhase" "cleanupPhase" ];
+ unpackPhase = ''
+ mkdir -p $TMPDIR
+ cp ${./generator.lisp} $TMPDIR/generator.lisp
+ mkdir -p $TMPDIR/data
+ cp -r ${toString ./data}/* $TMPDIR/data/
+ mkdir -p $TMPDIR/templates
+ cp -r ${toString ./templates}/* $TMPDIR/templates/
+ mkdir -p $TMPDIR/static
+ cp -r ${toString ./static}/* $TMPDIR/static/
+ '';
+ installPhase = ''
+ mkdir -p $out/html
+ mkdir -p $out/gemini
+ mkdir -p $TMPDIR/output/gemini/articles
+ mkdir -p $TMPDIR/output/html
+ mkdir -p $TMPDIR/temp/data
+ cd $TMPDIR
+ ecl --load $TMPDIR/generator.lisp
+ cp -r $TMPDIR/output/html/* $out/html/
+ cp -r $TMPDIR/output/gemini/* $out/gemini/
+ cp -r $TMPDIR $out/tmpdir
+ '';
+ cleanupPhase = ''
+ rm -rf $TMPDIR/temp
+ '';
+ };
devShell = pkgs.mkShell {