diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9bad294 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +drafts/* \ No newline at end of file diff --git a/data/an-openioc-graph-a-different-kind-of-rule-scheme.md b/data/an-openioc-graph-a-different-kind-of-rule-scheme.md new file mode 100644 index 0000000..8034621 --- /dev/null +++ b/data/an-openioc-graph-a-different-kind-of-rule-scheme.md @@ -0,0 +1,240 @@ +Despite that I think that implementing a full-fledged +XML-editor is too complex for an operational scenario, I +believe the OpenIOC-format, which has been in the works at +Mandiant for a couple of years now, is quite good. They also +have the IOC Writer which was launched at last summers Black +Hat. OpenIOC can export to other expression languages, such +as Yara [1], as well. + +I have been thinking of a way to combine graph knowledge +with exactly that for a while, an expressive detection +language based on a graph. If combining two things you love, +I have learned that it simply can't end badly, it must end +with something amazing. Let's give it a try! + +So I went about it, starting off by importing a sample +Maltego-graph to Titan on HBase [2]. I basically set out +with five connected nodes in Maltego Tungsten. Nothing +malicious, just a national newspaper. + +Running that through my Rexster migration script results in +a equivalent graph on the Rexster server. + +It's nice considering if you'd like to put it in a larger +context with millions or billions of vertices you would like +to trigger on. That is out of bounds for Maltego, or your +desktop system in general. + + +## The OpenIOC Part + +If looking at the graphs above, you will probably agree that +it isn't especially describing of certain incidents or other +contextual data. But what if we could combine the graph with +something like OpenIOC? Turns out that it's conceptually +similar. The weakness of OpenIOC is that it doesn't scale +when firing up an OpenIOC editor - like the one Mandiant +have created. On the other hand, if you could traverse a +graph with OpenIOC designed around the OpenIOC format.. + +Let's create a basic writer as a demonstration, which +operates on the root level (no nesting of rules in this +example). + + from ioc_writer import ioc_api + from lxml import etree as et + + class IOC: + def __init__(self): + self.IOC = ioc_api.IOC(name='Test', description='An IOC generated from a Python script', author='Someone') + + self.IOC.set_created_date() + self.IOC.set_published_date() + self.IOC.set_lastmodified_date() + self.IOC.update_name('test_rexster') + self.IOC.update_description('A Test') + self.id = self.IOC.iocid + + def addNode(self,label,text,type,indicator,condition='is'): + IndicatorItem_node = ioc_api.make_IndicatorItem_node(condition, label, text, type, indicator) + current_guid = IndicatorItem_node.attrib['id'] + print current_guid + self.IOC.top_level_indicator.append(IndicatorItem_node) + + def __str__(self): + self.xml = et.tostring(self.IOC.root, encoding='utf-8', xml_declaration=True, pretty_print=True) + return self.xml + +This enables us to do something like this: + + ioc = IOC() + ioc.addNode('test','Just a test','domain','vg.no') + print ioc + +Which will again return the XML of the IOC. + + + + + test + A Test + + Someone + 2014-01-28T07:15:09 + + + + + + + vg.no + + + 195.88.55.16 + + + + + + + +Reviewing the XML above you might notice that the scheme is +pretty transferrable to a graph, perhaps even simplifying of +the IOC XML. 
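As a quick sketch of that mapping - this builds on the ``IOC`` wrapper class above and is only an illustration, not part of IOC Writer itself - the pieces we care about can be pulled back out of the document with a few lines of lxml:

    def indicator_items(root):
        # yield (condition, content type, content) for every IndicatorItem,
        # ignoring the OpenIOC namespace so it works across schema versions
        for item in root.xpath("//*[local-name()='IndicatorItem']"):
            for content in item.xpath("*[local-name()='Content']"):
                yield item.get('condition'), content.get('type'), content.text

    ioc = IOC()
    ioc.addNode('test', 'Just a test', 'domain', 'vg.no')
    for row in indicator_items(ioc.IOC.root):
        print row  # roughly: ('is', 'domain', 'vg.no')

Those three values are exactly what we will carry over to the graph.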
Be especially aware on the following tags and +attributes: + +* Content +* The IndicatorItem condition +* The content type + +A nested IOC might look like this (relevant excerpt): + + + + + vg.no + + + 195.88.55.16 + + + + +The above implies that the domain vg.no needs to be +accompanied with the IP-address ``195.88.55.16``. + +## Merging the Best of Two Worlds + +So now that we have had a look at the power in the structure +of a graph and the power of expression in the OpenIOC +XML-indicators, you might see why this is the best of two +worlds. + +In the challenge of combining them both I perhaps +oversimplified the nesting and used the two previously +mentioned attributes in the graph, adding the content as the +value of the node and the condition. We will also have to +add the type attribute since that tells us what type of +OpenIOC entry we have when reversing the process later +on. We will have a small collision between Maltego and +OpenIOC, since for instance an IP-address type will +differ. So for now you will need two type attributes, one +for Maltego and one for OpenIOC (if you plan to go both +ways). This is left as an exersise for the reader. + +Creating an OpenIOC-compatible graph is a breeze: + + from rexpro import RexProConnection + + class Graph: + def __init__(self): + self.graph = RexProConnection('localhost',8184,'titan') + + def addVertice(self,content,content_type,condition): + vertice_id = self.graph.execute(""" + def v1 = g.addVertex([content:content,content_type:content_type,condition:condition]) + return v1""", + {'content':content, 'content_type':content_type, 'condition':condition}) + return vertice_id + + def addEdge(self,vid1,vid2,label): + edge = self.graph.execute(""" + def v1 = g.v(vid1) + def v2 = g.v(vid2) + g.addEdge(v1, v2, label) + g.commit()""",{'vid1':vid1['_id'], 'vid2':vid2['_id'], 'label':label}) + + graph=Graph() + v1=graph.addVertice('vg.no','domain','is') + v2=graph.addVertice('195.88.55.16','ip','is') + graph.addEdge(v1,v2,'and') + +If you'd like to go the other way again in order to talk to +other organisations perhaps, you will want to run the +process in reverse: + + from rexpro import RexProConnection + + class RexsterIOC: + def __init__(self): + self.graph = RexProConnection('localhost',8184,'titan') + + self.IOC = ioc_api.IOC(name='Test', description='A test IOC generated from Rexster', author='Someone') + + self.IOC.set_created_date() + self.IOC.set_published_date() + self.IOC.set_lastmodified_date() + #IOC.add_link('help', self.baseurl + url) + self.IOC.update_name('test') + self.IOC.update_description('A Test') + self.id = self.IOC.iocid + self.lastId=None + + def addNode(self,label,text,type,indicator,condition='is',addToLast=False): + IndicatorItem_node = ioc_api.make_IndicatorItem_node(condition, label, text, type, indicator) + + if addToLast and self.last: + self.last.append(IndicatorItem_node) + else: + self.IOC.top_level_indicator.append(IndicatorItem_node) + + current_guid = IndicatorItem_node.attrib['id'] + self.last = IndicatorItem_node + + def traverse(self,rootNodeId): + root=self.graph.execute("""return g.v(80284)""",{'vid':str(rootNodeId)}) + self.addNode('test','Just a test', + root['_properties']['content_type'], + root['_properties']['content'], + root['_properties']['condition']) + + one_level_out=self.graph.execute("""return g.v(vid).out""",{'vid':str(rootNodeId)}) + for vertex in one_level_out: + self.addNode('test','Just a test', + vertex['_properties']['content_type'], + vertex['_properties']['content'], + 
vertex['_properties']['condition'],addToLast=True) + + def __str__(self): + self.xml = et.tostring(self.IOC.root, encoding='utf-8', xml_declaration=True, pretty_print=True) + return self.xml + + ioc = RexsterIOC() + ioc.traverse(80284) # the root node + print ioc + +One thing that you can now do is to store the indicators +with the rest of your network data. This again will imply +that the edges are created automatically without any need to +actually run jobs to combine data for detecting stuff. + +That's my small concept demonstration. I think it's pretty +cool! + +I've put the scripts in a Gist for you if you'd like to give +it a try [3]. + + +[1] Yara: https://github.com/mandiant/ioc_writer/tree/master/examples/openioc_to_yara +[2] Importing a sample Maltego-graph to Titan on HBase: https://gist.github.com/tommyskg/8166472 +[3] the scripts out there: https://gist.github.com/tommyskg/8671318 diff --git a/data/apm-lock.md b/data/apm-lock.md new file mode 100644 index 0000000..2f651d0 --- /dev/null +++ b/data/apm-lock.md @@ -0,0 +1,56 @@ +I have used OpenBSD for some time now and one of the things that I +have had to work a bit on to get the way I like it, is locking the +terminal upon apmd suspend. In other words locking the terminals +when I close the lid. + +Since it is a bit of code and that I reuse it other places, I +created this as a separate helper script. Thus, my +``/etc/apm/suspend``-reference is: + +``` +#!/bin/ksh + +lock.sh& +sleep 3 +``` + +The suspend file executes every time the lid is closed. + +Once upon a time I probably used different sources for this, but +anyways the script that I currently use are two-fold. The first +part locks all xenodm sessions with xlock: + +``` +CMD_LOCK="xlock" + +# get all currently running xenodm sessions +XSESSION=$(ps -axo user,ppid,args|awk '/xenodm\/Xsession/ { print +$1,$2}') + +# lock all logged in X sessions +for SESSION in "$XSESSION"; do + _USER=$(echo $SESSION | cut -f1 -d' ') + _PPID=$(echo $SESSION | cut -f2 -d' ') + _DISPLAY=$(ps -p $_PPID -o args=|cut -d' ' -f2) + su - $_USER -c "export DISPLAY=\"$_DISPLAY\" && $CMD_LOCK" & +done +``` + +The second part of the script kills all active consoles. This is +the most important part for me, since I most often lock the +screen, but forget to log off the consoles. + +``` +# kill open console TTYs +OPEN_TTYS=$(who|awk '{print $2}'|fgrep ttyC) +for _TTY in $OPEN_TTYS; do + T=$(echo $_TTY|sed 's/tty//'); + TTY_PID=$(ps -t $T|fgrep -v COMMAND|fgrep "ksh (ksh)"|awk '{print $1}'); + kill -9 $TTY_PID; +done +``` + +Please also be aware that suspending the laptop will leave things +in plaintext, in memory, so to truly be resistant to an evil maid +vector you would need to power off the laptop when out of a +controlled area. diff --git a/data/avenger-openbsd68.md b/data/avenger-openbsd68.md new file mode 100644 index 0000000..82a0596 --- /dev/null +++ b/data/avenger-openbsd68.md @@ -0,0 +1,27 @@ +Those following me on the Fediverse has recently become familiar +with an old-school program called Mail Avenger. 
+ + +``` +mkdir ~/.avenger +openssl rand -base64 8 | shasum | head -c16 > ~/.avenger/.macpass +echo "" >> ~/.avenger/.macpass +``` + + + +``` +brew install berkeley-db4 +curl -O http://www.mailavenger.org/dist/avenger-0.8.5.tar.gz +echo "b0fc3e2e03ed010e95e561367fce7b087968df7ea6056251eba95cad14d26d37 avenger-0.8.5.tar.gz" | shasum -a 256 --check +tar xvzf avenger-0.8.5.tar.gz +cd avenger-0.8.5 +./configure --with-db=/usr/local/Cellar/berkeley-db@4/4.8.30 +cd util +make macutil && install macutil ~/.local/bin/ +``` + +``` +macutil --expire=+2M --from "Tommy S" --fromexp "address expires" --sender "t+return+*@252.no" +``` + diff --git a/data/cognitive-automation.md b/data/cognitive-automation.md new file mode 100644 index 0000000..0c8b70a --- /dev/null +++ b/data/cognitive-automation.md @@ -0,0 +1,105 @@ +There is a lot of hype around many things in cyber +security. One concept that is not, is called Cognitive +Automation (CA). CA can be explained by comparing it to +traditional automation. That is, how tasks are automated: +like alerts correlation. By using cognitive automation, the +way the mind works is taken into account. I believe many +security professionals will recognise the practical aspects +of Schulte's model for "Complexity of automation vs +effectiveness/safety" [1]. + +I've written a post on this topic years ago ("The Role of +Cognitive Automation in Information Security"), but +unluckily that was lost in migration. It probably needed an +update anyways, and I believe the cyber security field is +more mature to receive this input now rather than at that +point. + +Cognitive automation is strongly applied in the aerospace +industry for instance. In aerospace, long ago, there was a +realisation that the strengths of thee human-being is the +ability to learn, instinct, problem reduction, ability of +abstraction and several others. The machine’s strength is +parallel processing, objectivity, long-term monitoring, +complex planning and decision making and so on. Schulte +describes this concept in detail, in Man-Machine Cooperation +model [1]. + +In order to benefit from a similar model in cyber security +there is a need to evolve the way data is extracted, +preprocessed and prepared for human-machine interaction. As +may be recognised at this point there are already technology +available to provide parallel processing on the machine +part. How a computing cluster would solve such a problem is +the evident problem. In that regard, machine learning is the +most promising technique to structure and classify the data +which seems to scale really well. Efficiently ingesting, +storing and preprocessing the data is the first stage of +that challenge. + +Another detail that I would like to point out here, from the +great book "The Multitasking Mind" by Salvucci and Taatgen, +is how the human mind works with buffers (the aural, visual, +declarative, goal, manual and problem buffers). A human can +actually only handle one thing at once. So when analysts are +tasked with several simultaneous tasks or roles, this will +definitively produce bad quality results. This is really +important to understand to all cyber security seniors and +designers, so read the book. + +Back to how this applies in practical terms: when analysts +manually analyse and decide by expert knowledge, classifying +the attributes of full content data and e.g. creates Yara +and Snort signatures, it is a reasonable assumption that a +number of relevant attributes are never evaluated as +potential anomalies. 
This greatly increases the +possibilities of the threat groups. In aerospace cognitive +automation there is a concept called Mission Management, +that is similar to the problem described here. + +Now for a practical example of how cognitive automation can +work, this time paralleled with the approach taken by +Netflix to movie recommenders. Let's say that you have +stored the PDFiD [2] vector of all PDF documents over the +last ten years, passing through a network. The vector +structure will look like: + +``` +obj,endobj,stream,endstream,xref,trailer,startxref,/Page,/Encrypt,/JS,/JavaScript,/AA,/OpenAction,/JBIG2Decode +``` + +or: + +``` +1. 7,7,1,1,1,1,1,1,0,1,1,0,1,0 +[...] +``` + +If 500 PDF files passes through the systems each day on +average, that will be 1825' documents over those ten +years. In addition qtime is a significant part of that +vector - and other parameters could be file names and so on. + +If an analyst receives a suspicious PDF file. That file may +initially hard to classify by the analyst. In such a case +the system should propose other related files to look +at. Practically speaking this saves the analyst cognitive +capacity to use instict, pattern recognition and creativity +to classify the document. The machine on the other hand +maintains objectivity, has great stress resistance, can +retrieve a lot more information, and it can process and +pivot on all those 10 years of documents as opposed to the +analyst. + +Now that you have gotten an introduction to the world of +cognitive automation, I hope this will drive a discussion on +how we can take our field to the next level. I am confident +that this means understanding and solving problems before +attempting to buy our way out of them. + + +[1] Schulte, D. A. 2002. Mission management and crew assistance for military aircraft: cognitive concepts and prototype evaluation. +[2] PDFiD: https://blog.didierstevens.com/2009/03/31/pdfid/ + + + diff --git a/data/converting-pst.md b/data/converting-pst.md new file mode 100644 index 0000000..5f109d8 --- /dev/null +++ b/data/converting-pst.md @@ -0,0 +1,100 @@ +Some time ago I gave an introduction to converting Microsoft +MSG files [1] to a readable RFC 2822 [2] format on Linux. In +fact you will sometimes get an even kinkier format to work +with: The Outlook Data File (PST) [3]. PST files is a +proprietary format used by Microsoft Outlook, and is the +equivalent of the mbox on Linux. + +**Edit August 29th**: Also have a look at the more +up-to-date [4]. + +Even though PST files are a bit harder to read than single +EML files, there is hope if you only have a Linux client: +libpst, and more specifically readpst. For libpst you need +three libraries: + +* ``libgsf`` (i/o library that can read and write common file +types and handle structured formats that provide +file-system-in-a-file semantics) +* boost (portable C++ source libraries) +* libpst + +On OS X you can install it by: + +``` +brew install libgsf +brew install boost +brew install libpst +``` + +Now if you have a pst archive, like [5] for instance, you can +convert it by: + + mkdir export + readpst -M -b -e -o export "Personal Folders.pst" + +This should give an output like this: + + Opening PST file and indexes... + Processing Folder "Deleted Items" + Processing Folder "Inbox" + Processing Folder "latest" + [...] + Processing Folder "Reports" + "Reports" - 11 items done, 1 items skipped. + Processing Folder "Quotes" + "Quotes" - 1 items done, 1 items skipped. 
+ Processing Folder "Printer" + "Printer" - 1 items done, 1 items skipped. + Processing Folder "Passwords" + "Passwords" - 6 items done, 1 items skipped. + [...] + Processing Folder "Kum Team" + "Kum Team" - 37 items done, 0 items skipped. + "9NT1425(India 11.0)" - 228 items done, 1 items skipped. + Processing Folder "Jimmi" + "Jimmi" - 31 items done, 0 items skipped. + "Inbox" - 27 items done, 11 items skipped. + Processing Folder "Outbox" + Processing Folder "Sent Items" + "Sent Items" - 0 items done, 1 items skipped. + Processing Folder "Calendar" + "Calendar" - 0 items done, 6 items skipped. + Processing Folder "Contacts" + "Contacts" - 0 items done, 1 items skipped. + [...] + Processing Folder "Drafts" + Processing Folder "RSS Feeds" + Processing Folder "Junk E-mail" + Processing Folder "quarantine" + "My Personal Folder" - 13 items done, 0 items skipped. + +Which creates a directory structure like ``ls -l 'export/My +Personal Folder'``: + + drwxr-xr-x 2 - staff 68 Aug 28 21:34 Calendar + drwxr-xr-x 2 - staff 68 Aug 28 21:34 Contacts + drwxr-xr-x 29 - staff 986 Aug 28 21:34 Inbox + drwxr-xr-x 2 - staff 68 Aug 28 21:34 Journal + drwxr-xr-x 2 - staff 68 Aug 28 21:34 Sent Items + drwxr-xr-x 2 - staff 68 Aug 28 21:34 Tasks + +If you sample ``Inbox/Mails/``, you will find: + + 1.eml 10.eml 11.eml 12.eml 13.eml 14.eml 15.eml 16.eml 17.eml 2.eml 3.eml 4.eml 5.eml 6.eml 7.eml 8.eml 9.eml + +You can now continue with our previous post [6]. I'll also +encourage you to have a look at the documentation of the +Outlook PST format [7]. + + +[1] Converting Microsoft MSG files: /2013-10-08-msg-eml.html +[2] RFC 2822: http://tools.ietf.org/html/rfc2822 +[3] The Outlook Data File (PST): http://office.microsoft.com/en-001/outlook-help/introduction-to-outlook-data-files-pst-and-ost-HA010354876.aspx +[4] libpff: /converting-pst-archives-in-os-xlinux-with-libpff +[5] Example PST file: http://sourceforge.net/projects/pstfileup/files/Personal%20Folders.pst/download +[6] Reading MSG and EML Files on OSX/Linux Command Line: :4443/forensics/reading-msg-files-in-linux-command-line/ +[7] The outlook.pst format: http://www.five-ten-sg.com/libpst/rn01re05.html + + + diff --git a/data/gpg-openssl.md b/data/gpg-openssl.md new file mode 100644 index 0000000..0966397 --- /dev/null +++ b/data/gpg-openssl.md @@ -0,0 +1,126 @@ +## Key Takeaways + +* PGP are replaceable with native OpenSSL RSA public key crypto + and AES-256 keys. +* This approach simplifies crypto operations, and only requires + OpenSSL which is widely available. +* Existing PGP keys stored in GnuPG work with OpenSSL via `gpgsm`. + +## Introduction + +The rabbit hole mission of mine to get rid of PGP continues. + +Lately I have been looking into converting PGP keys from GnuPG to +OpenSSL. This way I can send encrypted data to people not using my +OpenSSL-only approach. After all, most people still depend on PGP +and it is the format they publish their public keys in. + +## Exporting A PGP Public Key for Encryption Using OpenSSL + +A PGP key cannot be directly read by OpenSSL, but GPG can natively +export to SSH and ssh-keygen to PKCS8: + +``` +gpg --export-ssh-key ! 
> /tmp/test.pub +ssh-keygen -f /tmp/test.pub -e -m PKCS8 > /tmp/test.pem +``` + +The above pubkey can be used to encrypt data with OpenSSL as shown +on my [contact page](https://contact.252.no): + +``` +KEY=`openssl rand -hex 32` IV=`openssl rand -hex 16` +ENCRYPTED_KEY_B64=`openssl pkeyutl -encrypt -pubin -inkey /tmp/test.pem -pkeyopt rsa_padding_mode:oaep <<< $KEY|base64` +BLOB=`openssl enc -aes-256-cfb -a -e -K ${KEY} -iv ${IV} -in some-file` +echo "PKCS11-VAULT;aes-256-cfb;rsa_padding_mode:oaep;$ENCRYPTED_KEY_B64:$IV:$BLOB;" > encrypted.txt +``` + +The steps of the above are: + +1. Create an initialization vector [1] and an encryption key +2. Encrypt the one-time key to test.pem (our exported PGP-key) +3. Encrypt `some-file` using the key and IV using 256 bits AES in CFB-mode +4. Format the output in my PV-format. + +Store `encrypted.txt` for decryption in the next section. + +## Exporting a PGP Private Key for Decryption Using OpenSSL + +This part is a bit more complex. For the sake of an example, let +us say you received an encrypted blob with an IV and encrypted +key, using the approach shown in the former section. You have the +key stored in GnuPG. + +`gpgsm` can export your private key to p12, which is readable for +OpenSSL [2]. + +First list your secret keys in the GnuPG store: `gpg +--list-secret-keys --with-keygrip`. + +Convert the key to X.509 by: `gpgsm --gen-key -o +/tmp/temp.crt`. You need to fill the values requested: + +* Select "existing key" +* Fill the keygrip from the GPG secret key listing. Make sure you + use the right key, since GPG generates several keys behind the + scenes (the encryption key) +* Fill the cn (this needs to be on the format "cn=...") and e-mail +* Accept the other values as empty and accept the creation + +Now import the certificate into `gpgsm`: `gpgsm --import +/tmp/temp.crt`. When imported, find the key ID by: `gpgsm +--list-keys`. + +Using the key ID, you can now export the key in p12-format. + +``` +gpgsm -o /tmp/$keyid.p12 --export-secret-key-p12 $keyid +openssl pkcs12 -in /tmp/$key.p12 -nodes -nocerts|tail -n +5 > /tmp/$key.key +``` + +You only need to do the conversion once and now have your key in +`/tmp/$key.key`. This should be secured accordingly, and have a +password set as is offered in the guidance by gpgsm. + +The resulting `/tmp/$key.key` is usable for decrypting content +encrypted by the public key. To decrypt the data in `encrypted.txt`: + +``` +IFS=';' read IDENTIFIER ALGORITHM PADDING_MODE ENCRYPTION_BLOBS SIGNATURE < encrypted.txt + +for BLOB in ${ENCRYPTION_BLOBS[@]}; do + IFS=':' read ENCRYPTED_KEY_B64 IV TEXTFILE_ENC <<< $BLOB + ENCRYPTED_KEY=`printf $ENCRYPTED_KEY_B64 | base64 -d` + decrypted=false + DECRYPTED_KEY=`echo $ENCRYPTED_KEY_B64 |base64 -d | openssl pkeyutl -decrypt -inkey /tmp/$key.key -pkeyopt ${PADDING_MODE} 2> /dev/null` && decrypted=true + if [ $decrypted != false ]; then + TEXTFILE_DEC=`printf %s "$TEXTFILE_ENC"|base64 -d|openssl enc -$ALGORITHM -d -K "$DECRYPTED_KEY" -iv "$IV" |base64` + break + fi +done + +echo $TEXTFILE_DEC +``` + +The above format supports encryption to multiple parties. It: + +1. Reads the PV-format into variables +2. Loops through the encryption blobs (one pass if one recipient) +3. Decrypts the key with the private key generated from `gpgsm` +4. Using the IV and decrypted key, decrypts the content, which is + eventually the same as in the previous section's `some-file` +5. 
Prints the decrypted content + +## Conclusion + +It is possible to convert PGP keys to use with OpenSSL via `gpgsm`. + +Since OpenSSL is more widely distributed and installed than GnuPG, +it is a method applicable in more environments. + +Using OpenSSL instead of GnuPG provides more options, and reduces +the complexity of cryptography (since GnuPG has lots of options). + +[1] https://stackoverflow.com/questions/39412760/what-is-an-openssl-iv-and-why-do-i-need-a-key-and-an-iv + +[2] https://superuser.com/a/1414277 diff --git a/data/graph-experiment.md b/data/graph-experiment.md new file mode 100644 index 0000000..af64f5f --- /dev/null +++ b/data/graph-experiment.md @@ -0,0 +1,103 @@ +I currently maintain this threat database, and up until now I've +generated the graph data for d3 using queries, and a lot of logic, +in a MySQL-database. That is going to change pretty soon. You +might also remember when we did Social Network Analysis and Object +Attribution with Maltego 3 [1]. + +In my seeking for understanding the Apache Hadoop ecosystem I all +of a sudden got a brutal meeting with Java (Eclipse huh..). I also +discovered that there are a world of libraries and applications +previously unknown to me. One of them is the über-awesome Neo4j, +which is a graph database originally built for Java - but guess +what: It's got a REST API as well. As usual you don't have to +write the Python code yourself, someone already wrote it for +you. Note that it only does Python 2 for now [2,3]. + +The coolest thing about Neo4j is Cypher [5]: Cypher is a "graph +query language" as they put it themselves. With Cypher you can +express what you look for in an entirely other way than you would +do in a relational database, it's actually easy. + +And: You of course need the database running as well. If you use a +Debian system like me your in luck since they have an experimental +version out there [5]. + +Enough talk, here is a simple example of how you could go about it +in regard to scripting the relations considering threat +intelligence in order to connect groups to incidents. The goal +would be to find peripherally connected groups. + + from GraphConn.Connect import Graph + g = Graph() + + # create groups + g.cGroup("ThreatA") + g.cGroup("ThreatB") + g.cGroup("ThreatC") + + # create incidents + g.cIncident("IncA") + g.cIncident("IncB") + g.cIncident("IncC") + + # relate groups in some way to each other through incidents + g.link("ThreatA","IncA") + g.link("ThreatA","IncB") + g.link("ThreatB","IncC") + g.link("ThreatC","IncA") + g.link("ThreatB","IncB") + + # find all threats related to Threat A through incidents + print g.fRelated("ThreatA") + +You might find this simple, but if you've ever tried to do it in +SQL you know why you'll need it. Also, remember that this scales +indefinite to other entity types as well. 
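As a sketch of what that could look like - the ``cDomain`` helper and the domain name below are hypothetical, simply mirroring ``cGroup``/``cIncident`` in the class further down - tying infrastructure into the same graph is just another node type:

    # hypothetical helper, same pattern as cGroup/cIncident below
    def cDomain(self,name):
        n = self.gdb.nodes.create(name=name, type='Domain')
        self.nodes.append(n)

    # usage: attach a C2 domain to an incident and pivot on it
    g.cDomain("evil-example.org")
    g.link("IncA","evil-example.org")
    print g.fRelated("evil-example.org")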
+ +Here's the class used to generate the graph, for reference (feel +free to copy it, produce something cool and post it back in the +comment field): + + from neo4jrestclient import client + from neo4jrestclient.client import GraphDatabase + from neo4jrestclient.query import Q + + class Graph: + def __init__(self): + self.gdb = GraphDatabase("http://localhost:7474/db/data/") + self.nodes = [] + + def cGroup(self,name): + n = self.gdb.nodes.create(name=name, type='Group') + self.nodes.append(n) + + def cIncident(self,name): + n = self.gdb.nodes.create(name=name, type='Incident') + self.nodes.append(n) + + def link(self,n1,n2): + try: + l = (Q("name", iexact=n1)); n1 = self.gdb.nodes.filter(l)[0]; + l = (Q("name", iexact=n2)); n2 = self.gdb.nodes.filter(l)[0]; + return n1.relationships.create("Executed", n2) + except: + return False + + def fRelated(self,query): + l = (Q("name", iexact=query)) + n = self.gdb.nodes.filter(l)[0] + r = n.traverse() + for n2 in r: + for e in n2.traverse(): + r.append(e) + return list(r) + +I really hope you enjoy this as much as me right now. The Facebook +Graph Search for the rest of us. + + +[1] gopher://secdiary.com/0/post/sna-oa-maltego/index.txt +[2] https://pypi.python.org/pypi/neo4jrestclient/ +[3] https://neo4j-rest-client.readthedocs.org/en/latest/elements.html +[4] http://www.neo4j.org/learn/cypher +[5] http://debian.neo4j.org/ diff --git a/data/graphs-scale.md b/data/graphs-scale.md new file mode 100644 index 0000000..2dbc035 --- /dev/null +++ b/data/graphs-scale.md @@ -0,0 +1,82 @@ +Following up on my post yesterday, I have also been looking at +graphs the other way - from a scalable database to a manageable +graph involving e.g. just one segment. + +There are currently two ways to do this: + +1) Export the graph, and 2) streaming the graph from and to the +graph database. The first option is obviously the simple one, but +doesn't always make up for our needs. The latter option is often +the case when you work multiple analysts at the same graph. + + +## Option 1: Exporting the Graph + +To achieve the first you can use the GraphML save function of +Gremlin. + + conf = new BaseConfiguration(); + conf.setProperty("storage.backend","hbase"); + conf.setProperty("storage.hostname","sandbox.hortonworks.com"); + conf.setProperty("storage.port","2181"); + g = TitanFactory.open(conf); + g.saveGraphML('test.graphml') + +This graph can again be opened in tools such as Gephi. + +You can also use the Gephi database API plugin for +Rexster. There's a Blueprints repo [1] which extends that. Short +how-to on how to get going with the Gephi development environment, +from the wiki-pages of the plugin [2]: + +1. Get plugins from [3], and [4] +2. Open Gephi, go to ``Tools > Plugins > Downloaded > "Add + Plugins..."`` +3. Press install and follow the guidance, at the end you should + restart Gephi +4. Go to File > Import Database +5. Add the Rexster configuration to ``/etc/graph/rexster.xml`` (if + when importing the database issues arises, look at [5] + +``rexster.xml`` should look like this: + + + RexterGraph + com.tinkerpop.rexster.config.RexsterGraphGraphConfiguration + 100 + http://192.168.109.128:8182/graphs/titan + + +You should be left with something like this for instance in Gephi: + +![A Rexster Graph Import to Gephi, from a Titan database. The graph consists of a variety of segments, such as articles from a article-system and imported Maltego graphs](/static/img/data/rexster-import-gephi.png) + +A Rexster Graph Import to Gephi, from a Titan database. 
The graph +consists of a variety of segments, such as articles, imported +Maltego graphs and such. + +A Rexster Graph Import to Gephi, from a Titan database. The graph +consists of a variety of segments, such as articles from a +article-system and imported Maltego graphs + +Here's the cluster on the right there by the way. There's some +interesting patterns inside there it seems, so I suspect it's from +a Maltego graph: + +![](/static/img/data/gephi-cluster-maltego.png) + +## Option 2: The Gephi Streaming API + +For the other option I found the Gephi graph streaming API +[6]. This one I currently found a little limited in that it can +only provide collaboration between two Gephi instances using a +Jetty web-server. It's pretty cool, but doesn't offer the +integration I am looking for. I'll get back to this later. + +[1] https://github.com/datablend/gephi-blueprints-plugin +[2] https://github.com/datablend/gephi-blueprints-plugin/wiki +[3] https://github.com/downloads/datablend/gephi-blueprints-plugin/org-gephi-lib-blueprints.nbm +[4] +https://github.com/downloads/datablend/gephi-blueprints-plugin/org-gephi-blueprints-plugin.nbm +[5] https://github.com/datablend/gephi-blueprints-plugin/issues/1 +[6] https://marketplace.gephi.org/plugin/graph-streaming/ diff --git a/data/indicators.md b/data/indicators.md new file mode 100644 index 0000000..de851a8 --- /dev/null +++ b/data/indicators.md @@ -0,0 +1,463 @@ +Over what have become some years, cyber security +professionals have been working on optimising the sharing of +information and knowledge. A lot of the efforts have +recently been focused around intelligence- and data-driven +teams. Today many of these discussions have ended evolving +around something related to the STIX format. + +> Don't use a lot where a little will do +> – Unknown origin + +This post features a perspective of the potential of today's +standard-oriented approach for documenting indicator sets +related to cyber security threat actors and incidents. It +turns out we have a longer way to go than expected. + +For the purpose of this article, an indicator is a +characteristic or evidence of something unwanted, or hostile +if you'd like. I like to refer to the military term +"Indicators & Warnings" in this regard. In other words, an +indicator isn't necessarily limited to the cyber domain +alone either. Physical security could be in an even worse +condition than cyber security when it comes to expressing +threat indicators. I'll leave the cross-domain discussion +for another time. + +## Up Until Today + +Multiple standards have evolved and disappeared, and one +that I have been in favor of previously is the OpenIOC 1.1 +standard. However, times are changing, and so are the +terminology and breadth of how we are able to express the +intrusion sets. + +Even though OpenIOC was a very good start, and still is as +far as I am concerned, it has far been surpassed Cybox and +ultimately STIX [1] in popularity. + +STIX is a container, a quite verbose XML format (which is +turning JSON in 2.0). Cybox is the artefact format [2], for +malware you have MAEC [3] and so on. Basically it's a set of +projects collaborating. + +This all sounds good, right? Not quite. Have a look at the +OpenIOC to STIX repository on Github [4] and you will find +that ``stuxnet.stix.xml`` is 202 lines of XML code for 18 +atomic indicators. OpenIOC on the other hand, is 91 lines, +and that is a verbose format as well. In fact the overhead +ratio of the STIX file is about 10:1, while OpenIOC is about +5:1. 
To add to the mind-blowing inefficiency, I have yet to see complex and nested expressions of an actor or a campaign in the STIX format on any regular basis.

Before you continue, do a simple Google search for "STIX editor" and "CybOX editor". Do it now, and while you are at it, google "OpenIOC editor" as well. These standards have been around for many years, so how should we interpret the fact that there still aren't any user-friendly ways of working with them? The closest I've come is MISP, and MISP generally does not use these standards for its internal workings either. This issue on the MISP GitHub tracker says it all: STIX 2.x support (MISP) [5].

I'm sure some will disagree with the above statements, calling out the infancy of these formats. However, they can't be called new standards anymore. They are simply too complex. One example is the graph-oriented relations implemented in the formats. Why not just let a graph database take care of those instead?

This is not just a post to establish the current state. How would a better approach look?

## What Is The Problem to Be Solved?

Back to where things have gone since the OpenIOC 1.1/atomic indicator days. The most promising addition, in my opinion, is the MITRE PRE-ATT&CK and ATT&CK frameworks. The two frameworks build on a less structured approach than the one used for atomic indicators (Lockheed Martin's Kill Chain). The latter can, for instance, be viewed in terms of the Intelligence Pyramid.

The Intelligence Pyramid's abstraction levels can be mapped against what each level is supposed to support when it comes to indicators:

| Level of abstraction | Supports    |
|----------------------|-------------|
| Behavior             | Knowledge   |
| Derived              | Information |
| Atomic               | Data        |

The purpose of the abstraction layers is to support assessments and measures at the corresponding contextual level. For instance, a technical report tailored to an Incident Response Team (IRT) generally concerns Derived and Atomic indicators, while an intelligence report would usually be based on the Behavioural level.

Having covered the abstraction layers, we can recognize that OpenIOC (or CybOX and MAEC) covers the bottom layers, while MITRE (PRE-)ATT&CK in its current form is mostly about the Behavioural level.

For Derived indicators there are primarily two well-established, seasoned and successful formats that have become standards through their widespread use. This is, among other things, because the indicators and rules are effective, quick, easy and pleasing to write.

First we have Snort/Suricata rules and Lua scripts, which were designed for network detection. For Snort/Suricata, I'd say that most of the metadata-based detection done today is probably expressible in OpenIOC (except for the magic that can be done with Lua). Second, there is the Yara format, which has become known for its applicability to malicious files. Both formats manage to stay simple while being powerful in expression. Thus, I'd say Yara and Snort/Suricata are the formats to look to for content and pattern detection.

> Indicators should be easy and pleasing to write.
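To illustrate the point, here is a minimal sketch of compiling and matching such a derived indicator from Python with yara-python - the rule content and the file name are made up for the example:

    import yara

    # a deliberately trivial rule; the string and names are hypothetical
    RULE = '''
    rule example_docx_dropper
    {
        strings:
            $name = "some_filename.docx"
        condition:
            $name
    }
    '''

    rules = yara.compile(source=RULE)
    print(rules.match(filepath='sample.bin'))  # e.g. [example_docx_dropper]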
To summarize the above, each of the formats can be mapped to an abstraction level:

| Level of abstraction | Formats            |
|----------------------|--------------------|
| Behavior             | MITRE (PRE-)ATT&CK |
| Derived              | Suricata+Lua, Yara |
| Atomic               | OpenIOC 1.1        |

Going through my notes on how I document my own indicators, I also found that I use the CVE database, datetimes, confidence levels and analyst comments for context and classification (the latter being irrelevant for detection).

One of the major problems is that everything currently out there breaks the analyst workflow. You either need to log in to some fancy web interface, edit XML files (god forbid), or you just jot everything down in a text file. The text file seems to be the natural fallback in almost every instance. I have even attempted to use the very good initiative from Yahoo, PyIOCe, as well as Mandiant's long-forgotten IOC Editor. Both projects have lost traction, like almost every other initiative in this space. So that's right folks, the text editor is still the preferred tool in 2018, and let's face it: indicators should be pleasing to design and create - like putting your signature on an incident or a job well done.

> an indicator set should be for humans and machines by humans

After all, the human is the one who has to deal with the indicator sets at some point, and we are the slowest link. So let us not slow ourselves down more than necessary. At this point I would like to propose the golden rule of creating golden rules: an indicator set should be for humans and machines by humans.

You may also have noticed that when all these standards are suddenly combined into one, they become less user-friendly. In other words, let us rather find our way back to our common \*NIX roots, where each tool had a limited set of tasks.

Graphs are essential when writing indicators. Almost everything in the world around us can be modelled as a network, and infiltration and persistence in cyberspace are no exception. Thus, an indicator format needs to be representable as a graph - and almost everything is, as long as it maintains some kind of structure.

For graphs there are two ways of going about the problem:

1) Implement the graph in the format

2) Make sure that you have a good graph backend and an automatable, traversable format available

For option 1, the graph embedded in the format increases the complexity significantly. Option 2 keeps the format simple, while still allowing it to be converted to a graph. To make an elaborate discussion short, this is what we have graph databases for, such as JanusGraph [6].

## A Conceptual View

Summarizing the above, I'd like to propose the following requirements for an indicator format:

1) Indicator sets should be easy and inviting to create

2) You should be able to start writing at any time, when you need it

3) Unnecessary complexity should be avoided

4) The format should be human readable and editable

5) A machine should be able to interpret the format

6) Indicator sets should be graph compatible

With a basis in this article, I believe the best approach is a basic plain-text format specification that inherits from the OpenIOC 1.1 and MITRE frameworks and references other formats where necessary.
Let us imagine that we found an IP address in one situation. The IP address was connected to a domain that we found using passive DNS. Further, a specific file was associated with that domain through a Twitter comment. Representing that information in its purest (readable) form looks like the following:

    // a test file
    class tlp:white
    date 2018/02/18
    ipv4 low 188.226.130.166
    domain med secdiary.com
    technique PRE-T1146
    filename med some_filename.docx
    comment found in open sources

To recap some of the previous points: the above format is simple, and it can be written at any time based on knowledge of well-known standards. Best of all, if you are heavily invested in specific formats, it can be converted to any of them with a simple interpreter traversing the format.

Further, such a format is easily converted into a tree and can be loaded into a graph for traversal and automated assessments. Each confidence value can be quantified (``low=0.33``, ``med=0.66``, ``high=1.0``). That said, simplicity in this case equals actionable indicators.

    | v: 188.226.130.166 (0.33) | match |
    | e | |
    | v: secdiary.com (0.66) | no match | (0.33+0.66)/2=0.5
    | e | |
    | v: some_filename.docx (0.66) | match |

On networks versus hierarchies: a drawback of a purely hierarchical format, as mentioned in the former section, is that it cannot express e.g. one domain being connected to several other vertices. A practical solution goes as follows:

    ipv4 low 188.226.130.166
    domain med secdiary.com
    domain low secdiary.com
    ipv4 low 128.199.56.232

The graph receiving the above indicator file should identify the domain as a unique entity and link the two IP addresses to the same domain:

    | v: 188.226.130.166 (0.33)
    | e: 0.5
    | v: secdiary.com (0.5)
    | e: 0.33
    | v: 128.199.56.232 (0.33)

As for structuring the indicator format for machines in practical terms, consider the following pseudocode:

    indicators = [(0,'ipv4','low','188.226.130.166'),...]
    _tree = tree(root_node)
    for indicator in indicators
        depth = indicator[0]
        _tree.insert(indicator,depth)

Now that we have the tree represented in code, it is trivially traversable when loading it into some graph:

    method load_indicators(node,depth):
        graph.insert(node.parent,edge_label,node)
        for child in node.children
            load_indicators(child,depth+1)

    load_indicators(tree,0)

## Summary

Hopefully I did not kill too many kittens with this post. You may or may not agree, but I do believe most analysts share at least parts of my purist view on the matter.

We are currently too focused on supporting standards and on having everyone use as few of them as possible. I believe that energy is better spent on becoming more consistent in the way we document, and on actually exchanging more developed indicator sets than the MD5 hash and domain lists that are typically shared today ("not looking at these kinds of files at all" - even though it's not the worst I've seen: ``MAR-10135536-F_WHITE_stix.xml`` [7]).

In the conceptual part of this article I propose a simple but effective way of representing indicators in a practical manner. Frankly, it is even too simple to be novel. It is just consistent and intuitive.

PS!
For the STIX example above, have a look at the following +to get a feel with the actual content of the file (used one +of the mentioned specimens to show the point): + + class tlp:white + date 2018/02/05 + + sha1 high 4efb9c09d7bffb2f64fc6fe2519ea85378756195 + comment NCCIC:Observable-724f9bfe-1392-456e-8d9b-c143af15f8d4 + comment did not convert all attributes + compiler Microsoft Visual C++ 6.0 + md5 high 3dae0dc356c2b217a452b477c4b1db06 + date 2016-01-29T09:21:46Z + entropy med 6.65226708818 + #sections low 5 + intname med ProxyDll.dll + detection med symantec:Heur.AdvML.B + +The original document states for those same indicators in no less than 119 lines +with an overhead ratio of about 1:5 (it looks completely insane): + + + + + + 3DAE0DC356C2B217A452B477C4B1DB06 + 336073 + PE32 executable (DLL) (console) Intel 80386, for MS Windows + + + MD5 + 3dae0dc356c2b217a452b477c4b1db06 + + + SHA1 + 4efb9c09d7bffb2f64fc6fe2519ea85378756195 + + + SHA256 + 8acfe8ba294ebb81402f37aa094cca8f914792b9171bc62e758a3bbefafb6e02 + + + SHA512 + e52b8878bd8c3bdd28d696470cba8a18dcc5a6d234169e26a2fbd9862b10ec1d40196fac981bc3c5a67e661cd60c10036321388e5e5c1f60a7e9937dd71fadb1 + + + SSDEEP + 3072:jUdidTaC07zIQt9xSx1pYxHvQY06emquSYttxlxep0xnC:jyi1XCzcbpYdvQ2e9g3kp01C + + + + + Microsoft Visual C++ 6.0 + + + Microsoft Visual C++ 6.0 DLL (Debug) + + + 6.65226708818 + + + 5 + 2016-01-29T09:21:46Z + 4096 + + + MD5 + e14dca360e273ca75c52a4446cd39897 + + + + + 0.672591739631 + + + + + + .text + 49152 + + + 6.41338619924 + + + + MD5 + 076cdf2a2c0b721f0259de10578505a1 + + + + + + .rdata + 8192 + + + 3.293891672 + + + + MD5 + 4a6af2b49d08dd42374deda5564c24ef + + + + + + .data + 110592 + + + 6.78785911234 + + + + MD5 + c797dda9277ee1d5469683527955d77a + + + + + + .reloc + 8192 + + + 3.46819043887 + + + + MD5 + fbefbe53b3d0ca62b2134f249d249774 + + + + + + + + + + +[1] STIX: https://oasis-open.github.io/cti-documentation/ +[2] Cybox example: https://github.com/CybOXProject/schemas/blob/master/samples/CybOX_IPv4Address_Instance.xml +[3] MAEC: https://maec.mitre.org/ +[4] OpenIOC to STIX repository on Github: https://github.com/STIXProject/openioc-to-stix +[5] STIX 2.x support (MISP): https://github.com/MISP/MISP/issues/2046 +[6] Janusgraph: http://janusgraph.org/ +[7] MAR-10135536-F_WHITE_stix.xml: https://www.us-cert.gov/sites/default/files/publications/MAR-10135536-F_WHITE_stix.xml diff --git a/data/jnetpcap-tuning.md b/data/jnetpcap-tuning.md new file mode 100644 index 0000000..74a0a66 --- /dev/null +++ b/data/jnetpcap-tuning.md @@ -0,0 +1,84 @@ +It comes a time when programming that one will have to start +paying attention to performance. As this is true in many cases, +there are especially two places that is especially important: With +parallel processing and packet captures. Even better if doing both +at once. In this article we'll keep the latter in mind together +with jNetPcap, a Java wrapper for libpcap able to do 60Kpps per +instance. + +First of all I found an excellent post on performance tuning +jNetPcap. There's also a good implementation example for moving to +the much faster ``JBufferHandler`` [1]. + +One should take note of the ring buffer, that is how much memory +you will have to temporarily store packets if there's a lot of +traffic. Usually this may be e.g. 453k, while the maximum can be +4M (for instance 4078 as it was in my case). For tuning this on +RedHat one may use ``ethtool -g eth0``, and adjust it with +``ethtool -G eth0 rx 4078``. 
Larger buffers results in high +throughput, but also higher latency (which is not that important +when doing packet captures). More on ethtool and ring buffer +adjustments here. + +When it comes to jNetPcap, the following is an example +implementing it as a Apache Flume source [2]: + + @Override + public void start() { + final ChannelProcessor channel = getChannelProcessor(); + + JBufferHandler jpacketHandler = new JBufferHandler() { + + public void nextPacket(PcapHeader pcapHeader, JBuffer packet, ChannelProcessor channelProcessor) { + int size = packet.size(); + JBuffer buffer = packet; + byte[] packetBytes = buffer.getByteArray(0, size); + + Event flumeEvent = EventBuilder.withBody(packetBytes); + channel.processEvent(flumeEvent); + } + }; + + super.start(); + pcap.loop(-1, jpacketHandler, channel); + + } + +The above shows you a slightly different version than the most +well-documented example (``PcapHandler``) [3]. You should choose +the above one since it is much faster due to the packet +referencing. I did a test on one site and the performance +increased drastically in terms of improving packet loss on the +software-side of things. + +Last but not least, in order to do software side performance +monitoring, you might want to add a handler to capture statistics +in jNetPcap. This is mentioned here in the jNetPcap forums as well +[4]: + +> You can also use PcapStat to see if libpcap is dropping any +> packets. If the buffer becomes full and libpcap can't store a +> packet, it will record it in statistics. This is different from +> the NIC dropping packets. + +This may be implemented in the configuration as shown here: + + PcapStat stats = new PcapStat(); + pcap = Pcap.openLive(device.getName(), SNAPLEN, Pcap.MODE_PROMISCUOUS, timeout, errbuf); + pcap.stats(stats); + +You can get the stats with the following: + + System.out.printf("drop=%d, ifDrop=%d\n",stats.getDrop(), stats.getIfDrop()); + + +Hope this gets you up and running smoothly, tuning packet captures +in chain with parallel computing is a challenge. + +To get some more context you may also like to have a look at the +presentation that Cisco did on OpenSOC, that's how to do it. + +[1] http://jnetpcap.com/node/67 +[2] http://flume.apache.org/ +[3] http://jnetpcap.com/examples/dumper +[4] http://jnetpcap.com/node/704 diff --git a/data/mac-mini-debian.md b/data/mac-mini-debian.md new file mode 100644 index 0000000..8e2fb1c --- /dev/null +++ b/data/mac-mini-debian.md @@ -0,0 +1,173 @@ +There are a lot of guides on booting Linux on an Mac Mini, and the +Mac Mini is absolutely great. There's also a lot of guides which +takes some unnecessary steps on the way from the native OS X +experience to the bloated, and difficult-to-setup Linux on OS +X. Some of them are good on certain points though. + +So, not surprising, I will tell you how to make it work with both +a native EFI installation and the Broadcom BCM4366 up and running. + +Everything will be done on the command line, so this will work +great on servers as well. Of course you won't run wifi on the work +server though (!). + +First, take note that this will wipe almost everything Apple from +you box except the Firmware. You may roll back through pressing +the ALT-key while booting. + +Second, you should use Debian 8.0 "Jessie" (which is currently in +RC1). This is important since Wheezy doesn't support the Broadcom +chipset. 
+ +Prerequisites for this article are: + +* A Mac Mini, tested on an OCT 2014 model +* A keyboard +* A USB memory stick of at least 2GB (speed is the key) + +## 1. Install Debian - and Change Boot Order + +You should create a bootable USB stick for your Debian +installation. When you've downloaded the ISO, you can make it +bootable without hassle through Unetbootin [1]. That one works on +OS X 10.10 "Yosemite" as well. + +When you've got that one ready insert it into the Mini, holding +the ALT-key while booting. You will get to the boot menu, choose +the "EFI" one. This will initiate GRUB from the stick. + +Do the installation as you would on any other machine. Since your +mac is still setup to boot to OS X, we need to change that next in +order to make it point to the Debian installation instead. + +When rebooting, get into the boot menu by holding the ALT-key +again. Select that same GRUB menu again, _BUT_ instead of choosing +to install it you should now press "c" to get to the GRUB command +line. + +It is now time to locate the boot directory [2] on the right +disk. Vary X (disk) and Y (partition table) until you find the +right combination: + + grub> ls (hdX,gptY)/boot/grub + +That may for instance result in: + + grub> ls (hd2,gpt2)/boot/grub + +Set the ``root`` to that disk and partition table, and boot it: + + grub> set root=(hd2,gpt2) + grub> ls -l (hd2,gpt2) + grub> linux /boot/vmlinux[...].efi.signed root=UUID=[uuid from above command] + grub> initrd /boot/initrd[...] + grub> boot + +You will now boot to the one you just installed. It is time to +make it persistent and change the boot order with +``efibootmgr``. First list your current settings by: + + sudo efibootmgr + +Now change the boot order (may vary, point being that Debian +should come first): + + sudo efibootmgr -o 0,1 + +Now reboot and enjoy the darkness without wifi. + +## 2. Get Wifi Up and Running (Offline) + +The current Broadcom chipset is quite new, so you'll need to step +it up to Debian "Jessie" to get it working. Cutting this a bit +short, you will probably need this part to be offline. 
Showing you +a small trick you can get all those dependencies on a vmware +installation (run the same image as the one you installed, +remember to simulate that you don't have network on that virtual +installation): + + apt-get -qq --print-uris install build-essential linux-headers-$(uname -r) broadcom-sta-dkms patch bzip2 wpasupplicant | cut -d\' -f 2 > urls.txt + +This will produce a file of urls that are all the packages +requested and its dependencies, get the stick, format it with +FAT - and grab the packages to it: + + wget -i urls.txt + +Unmounting that from the virtual installation, insert it into the +physical installation: + + cd /mnt/usb + dpkg -i *.deb + +Remove all modules that may conflict (and blacklist them in +``/etc/modprobe.d/blacklist.config``): + + modprobe -r b44 b43 b43legacy ssb brcmsmac + +Load the Broadcom module: + + modprobe wl + echo wl >> /etc/modules + +Everything that's left now is configuring and starting +wpasupplicant: + + wpa_passphrase [passphrase] > /etc/wpa_supplicant.conf + wpa_supplicant -B -i wlan0 -c /etc/wpa_supplicant.conf + +To make it persistent enable the interface in +``/etc/network/interfaces`` by appending: + + auto wlan0 + iface wlan0 inet dhcp + wpa-conf /etc/wpa_supplicant.conf + + +If you have made an exception in your DHCP pool, you should also +make it static (basic stuff, but anyways): + + auto wlan0 + iface wlan0 inet static + wpa-conf /etc/wpa_supplicant.conf + address 192.168.1.2 + netmask 255.255.255.0 + gateway 192.168.1.1 + +That's basically it. Enjoy the show! + +**Edit 1, FEB 7th 2015:** So I got to play with ``systemd``, since +it turns out a service isn't a service the way it used to be. In +order to start services in Debian "Jessie", you'll need to use +``systemd``. Here's an example for ``znc`` [3]: + + [Unit] + Description=An advanced IRC bouncer + After=network.target oidentd.socket + + [Service] + Type=simple + EnvironmentFile=/etc/conf.d/znc + User=znc + ExecStart=/usr/bin/znc -f $ZNC_OPTIONS + ExecReload=/bin/kill -HUP $MAINPID + + [Install] + WantedBy=multi-user.target + +Also create the directory and drop the following line into +``/etc/conf.d/znc``: ``ZNC_OPTIONS="-d /var/lib/znc"`` + +**Edit 2, FEB 7th 2015:** To enable the Mac Mini to auto-restart +after power failure set the following PCI value [4]: + + setpci -s 0:1f.0 0xa4.b=0 + + + + +[1] http://unetbootin.sourceforge.net/ +[2] +http://askubuntu.com/questions/516535/how-can-i-use-the-installer-to-manually-boot-into-a-system-without-grub-installer +[3] https://gist.github.com/tlercher/3897561 +[4] http://smackerelofopinion.blogspot.no/2011/09/mac-mini-rebooting-tweaks-setpci-s-01f0.html diff --git a/data/maltego-search.md b/data/maltego-search.md new file mode 100644 index 0000000..7b38fb7 --- /dev/null +++ b/data/maltego-search.md @@ -0,0 +1,52 @@ +I've previously been writing on how to read and process Maltego +mtgx graph archives. When you start to get a directory with a lot +of them you will probably be like me "Where did I see this thing +again?" + +The solution can of course be done in Python like in my previous +post, but let's try a more native solution this time, zipgrep: + +> zipgrep will search files within a ZIP archive for lines +> matching the given string or pattern. zipgrep is a shell script +> and requires egrep(1) and unzip(1L) to function. Its output is +> identical to that of egrep(1). + +In my testing I had 20 files, and everything worked pretty well in +regard to searching the files by e.g. ``zipgrep 1.2.3.4 \*.mtgx +\*.graphml``. 
The problem here being that zipgrep doesn't seem to +support printing the archive names, so thank you for +that. Returning to the more basic zip tools, like zip cat was the +solution in my case: + + unzip -c \*.mtgx 2>&1 |egrep "(Archive: )|1.2.3.4" + + Archive: 1.mtgx + Archive: 2.mtgx + Archive: 3.mtgx + Archive: 4.mtgx + Archive: 5.mtgx + Archive: 6.mtgx + Archive: 7.mtgx + Archive: 8.mtgx + Archive: 9.mtgx + Archive: 10.mtgx + Archive: 11.mtgx + Archive: 12.mtgx + Archive: 13.mtgx + Archive: 14.mtgx + Archive: 15.mtgx + Archive: 16.mtgx + 1.2.3.4 + Archive: 17.mtgx + 1.2.3.4 + Archive: 18.mtgx + Archive: 19.mtgx + Archive: 20.mtgx + +A little Maltego archive insight helps us along speeding up the +query, since the graphml file will always stay at +``Graphs/Graph1.graphml`` + + unzip -c \*.mtgx Graphs/Graph1.graphml 2>&1 |egrep "(Archive: )|1.2.3.4" + +The latter results in the same results as given above. diff --git a/data/matrix.md b/data/matrix.md new file mode 100644 index 0000000..8ce0334 --- /dev/null +++ b/data/matrix.md @@ -0,0 +1,199 @@ +We have all been there during security operations. One of the +parties involved in an incident or daily routine is not prepared +for thinking they could be compromised. + +Communications and information sharing is one of the fundamental +things that you need to get right during a crisis. + +As now-retired FBI director James Comey put it to 60 minutes [1]: + +> There are two kinds of big companies in the United States. There +> are those who've been hacked by the Chinese and those who don't +> know they've been hacked by the Chinese. + +The following question always arises: How do we maintain +operational security while still being able to communicate with +all parties involved? + +In practical terms this requires a communications platform to: + +* Be independent of the service infrastructure +* Provide traceability +* Be resistant to resourceful threat actors +* Have simple and secure identity management +* Have cross-platform compability +* Provide file-sharing capabilities and ability to give the user + an opportunity to express himself +* Support video and audio exchanges +* Be under the control of the team using it (the smallest circle + of trust) +* Provide both end-to-end and transport layer encryption +* Disposable server infrastructure + +This could have been a bit too much to ask for a couple of years +ago, but today there are at least two alternatives satisfying the +above requirements: Mattermost and the Matrix ecosystem. For the +remainder of this post I will focus on how to establish an ad-hoc +system with the tools provided by the Matrix project. + +## Setting Up An Out-of-Band Channel for Incident Handling with Matrix + +Getting started takes three steps: + +1. Establish a back-end server on Digital Ocean +2. Serve the Riot front-end website +3. Establish a recording capability with Matrix Recorder [2] + +For the two first points, it is clever to use an approach that can +be easily reproduced and that provides exactly the same, +secure-by-default configuration each time. Due to this the +preferred method in this case is to manage the VPS that can be +established on anything with Debian or CentOS with Ansible. There +is a script available on Github, known as +matrix-docker-ansible-deploy [3]. The latter have also been +endorsed by the Matrix project [4]. Both 1 and 2 can be +accomplished with ``matrix-docker-ansible-deploy``. + +So let's get started. 
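Step 1 assumes you already have a droplet to point the playbook at. If you prefer to script that part as well, a minimal sketch using the python-digitalocean package could look like the following (the token, region and size slugs are placeholders - any small Debian droplet will do):

    import digitalocean

    manager = digitalocean.Manager(token="YOUR_DO_API_TOKEN")

    droplet = digitalocean.Droplet(
        token="YOUR_DO_API_TOKEN",
        name="matrix",
        region="ams3",                       # placeholder region
        image="debian-9-x64",                # placeholder image slug
        size_slug="s-2vcpu-4gb",             # placeholder size slug
        ssh_keys=manager.get_all_sshkeys(),  # reuse the keys on the account
    )
    droplet.create()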
+ +### Basic DNS-service + +For this example I created a domain on namesilo.com and pointed +that to ``(ns1|ns2|ns3).digitalocean.com``. It would be ufortunate +for the continuity of the service if a domain was taken offline or +redirected somewhere, but due to the end to end encryption in +Matrix it would not compromise the content of the +conversations. Now that Digital Ocean has control of the primary +domain, make sure to add the following before continuing: + + Type Hostname Value TTL + A 600 + A riot. 600 + A matrix. 600 + SRV _matrix._tcp. 10 0 8448 matrix. 600 + +This can take some time to propagate, so make sure that the +DNS-infrastructure is readily resolvable before you continue +deploying the services. + +### Configure + +Make sure to grab a copy of the current +``matrix-docker-ansible-deploy`` by running: + + git clone https://github.com/spantaleev/matrix-docker-ansible-deploy.git + +Create the following files: + + inventory/host_vars/matrix./vars.yml + inventory/hosts + +``vars.yml`` should look like this: + + host_specific_matrix_ssl_support_email: + host_specific_hostname_identity: + matrix_coturn_turn_static_auth_secret: "" + matrix_synapse_macaroon_secret_key: "" + +The Ansible ``hosts`` file should be formatted like the following: + + all: + children: + matrix-servers: + hosts: + matrix.: + ansible_user: root + +### Deploy and Execute + +Now that your configuration files and server are ready, you can +start deploying the Matrix Synapse server and start serving the +Riot HTML/JS client. + +First deploy the services (Riot and Matrix Synapse) by running: + + ansible-playbook -i inventory/hosts setup.yml --tags=setup-main + +When that completes successfully, you can start the services by: + + ansible-playbook -i inventory/hosts setup.yml --tags=start + +After starting the services, the Riot web interface is available +on ``https://riot.`` where metadata is protected by a +Let's Encrypt certificate. + +The two primary endpoints you now have exposed to the WWW is: + +* The Matrix API which runs at https://matrix. +* The Riot UI which runs at https://riot. + +Going to ``https://riot.`` brings you to the Riot +logon-screen + +### Adding Users + +Registration is disabled by default on the server, so new users +can be added by the following command: + + ansible-playbook -i inventory/hosts setup.yml + --tags=register-user + --extra-vars='username= + password= + admin=(yes|no)' + +It is better to use pseudonyms on such a platform to make sure no +information can be traced to a specific individual not involved in +the case. Each user needs to verify his private key fingerprint +with the other participants. + +### Vital Steps to Take as an Administrator + +When using multiple servers, it is necessary to create an +``#control`` channel that is a fallback if a server hosting a room +goes down. + +### Setup Matrix Recorder + +To make sure that all communications is stored for traceability +make sure to install the Matrix Recorded (MR). MR should be +installed locally and _not_ on the Matrix server. + + git clone https://gitlab.com/argit/matrix-recorder.git + cd matrix-recorder/ + npm install + +To execute the recorder, run the following. The first time you +will be asked to enter the login credentials of the user. + + $ node matrix-recorder.js + Loading olm... + Your homeserver (give full URL): https://matrix. + Your username at the homeserver: + Your password at the homeserver: + No of items to retrieve for initial sync: 1000 + [...] 
View messages as HTML by running the Matrix Recorder conversion
script:

    node recorder-to-html.js

### Controlling Logins

Access monitoring can be done in the console with e.g. ``tail -f
/matrix/synapse/run/homeserver.log``.

### The Power of Disposability

At some point you have finished the information exchange. The
beauty of this setup is that it can now be safely deleted from the
Digital Ocean droplet console.


[1] James Comey and 60 minutes: https://www.cbsnews.com/news/fbi-director-james-comey-on-threat-of-isis-cybercrime/

[2] Matrix Recorder: https://matrix.org/docs/projects/other/matrix-recorder.html

[3] matrix-docker-ansible-deploy: https://github.com/spantaleev/matrix-docker-ansible-deploy

[4] Matrix project endorsement: https://matrix.org/blog/2018/06/01/this-week-in-matrix-2018-06-01/

diff --git a/data/microsoft-dominating-email.md b/data/microsoft-dominating-email.md
new file mode 100644
index 0000000..762691b
--- /dev/null
+++ b/data/microsoft-dominating-email.md
@@ -0,0 +1,159 @@
## Key Takeaways

* While market dominance was formerly an issue discussed for
  operating systems, the modern equivalent occurs in the form of
  cloud services, primarily from Microsoft, Amazon and Google.

* Data from the Norwegian business registry mapped to email
  records shows that Microsoft Office 365 has become a dominant
  force among Norwegian private businesses and 61% of the
  government.

* Microsoft being the dominant email provider suggests that
  Norwegian organisations are placing far more than email in
  Microsoft's hands, since email as a service is today bundled
  with direct messaging and wikis.

## Introduction

In 2003 Dan Geer, Bruce Schneier and others wrote a paper named
"How the Dominance of Microsoft's Products Poses a Risk to
Security". It eventually cost Geer his job at AtStake.

The paper revolves around Microsoft's dominance in operating
systems, and Geer has later given Microsoft credit for a better
approach to security [2].

In this article I am not going to reiterate the points made by
Geer et al. I think they are perfectly valid and easily
transferable to the current landscape. The whole paper is worth
reading, but I'd like to highlight one part:

> Governments, and perhaps only governments, are in leadership
> positions to affect how infrastructures develop. By enforcing
> diversity of platform to thereby blunt the monoculture risk,
> governments will reap a side benefit of increased market
> reliance on interoperability, which is the only foundation for
> effective incremental competition and the only weapon against
> end-user lock-in. A requirement that no operating system be more
> than 50% of the installed base in a critical industry or in a
> government would moot monoculture risk. Other branches to the
> risk diversification tree can be foliated to a considerable
> degree, but the trunk of that tree on which they hang is a total
> prohibition of monoculture coupled to a requirement of
> standards-based interoperability.

Azure is Windows in 2021. The walled gardens are somewhat
redefined - but they are there in a similar fashion as Windows was
in 2003. The Microsoft monopoly is technically broken, and there
are now options from Amazon, Google and even Apple, but I would
argue the monoculture is still present in shared approaches,
infrastructure and concepts.
I decided to have a closer look at the distribution from a
representative dataset provided by an authoritative source in
Norway: the business registry.

## Taking a Close Look at The Data

In Norway we have a public registry of organisations. This
registry is categorised by standardised sector codes (typically
"government", "private" and so on). Using the JSON data provided
by brreg.no, a list of websites can be extracted:

 1. Retrieve the organisation list from brreg.no [3]

```
    curl https://data.brreg.no/enhetsregisteret/api/enheter/lastned > enheter.gz
    gzip -d enheter.gz
```

 2. Reshape the JSON data by website URL, sector and business code.

```
    cat enheter |
      jq '[.[] | select(.hjemmeside != null) | {url:.hjemmeside, code:.naeringskode1.kode, sector:.institusjonellSektorkode.kode}]' > webpages.txt
```

 3. Based on the URL, add the primary domain to each JSON entity
    and resolve both its MX record and the MX record's primary
    domain (a short sketch of this step is included at the end of
    this post).

 4. Using the JSON file generated above, populate the following
    JSON dictionary. This is a rough categorisation based on the
    standard provided by Statistics Norway (I'm sure it could be
    improved) [4]:

```
    {
      "government":{"codes": [6100,6500,1110,1120], "total":0, "counts":{}},
      "municipals":{"codes": [1510,4900,1520], "total":0, "counts":{}},
      "finance":{"codes": [3200,3500,3600,4300,3900,4100,4500,4900,5500,5700,4900,7000], "total":0, "counts":{}},
      "private":{"codes": [4500,4900,2100,2300,2500], "total":0, "counts":{}}
    }
```

 5. Generate CSV output based on each sector grouping above.


## The Result

The top vendor was, not surprisingly, Microsoft's outlook.com. Of
the 120k sites, 98k resolved an MX record. Since outlook.com
turned out to be the dominating actor in every category, the
summary below focuses on it:

* In government, 61% are O365 users (1420/2317)

* For municipalities, the share is 55% (688/1247)

* For the diverse financial grouping, 21% use O365 (4836/23125)

* For the diverse private companies, 38% use O365 (14615/38129)

Of the 98k sites, Microsoft runs the email service for 21559
organisations. For comparison, Google MX domains account for
about 5500.

While the above is a direct measurement of who delivers email
services, it also indicates that these organisations rely on
Microsoft for other services, such as internal wikis and direct
messaging.

An overview of the top 10 vendors is shown below.

![](static/img/data/mx_domains.png)

## Sources of Errors

Even though I believe the statistics above are representative,
there are some possible sources of error:

1. The organisation isn't listed with a URL in the organisation
   registry, or it uses an email domain not associated with the
   primary domain of its web address

2. The organisation uses an SMTP proxy

3. The organisation has an inactive MX record

I found that there are more than 1 million listed organisations in
the brreg.no registry and 120k websites in the JSON data provided.
This means the dataset represents at most 12% of the companies
listed.

Also, email alone doesn't describe how diverse an infrastructure
is, but I believe it is an indicator of the current trend for
other cloud services as well, e.g. Azure, Google Compute Engine
and so on.
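For reference, the MX-resolution step (3) in the method above can
be approximated with a short Python script. The following is an
illustrative sketch rather than the exact code used to produce the
numbers in this post: it assumes the third-party dnspython package
(version 2+, for ``dns.resolver.resolve``) and a deliberately
naive notion of "primary domain" (the last two labels of a
hostname).

```
import json
from urllib.parse import urlparse

import dns.resolver  # pip install dnspython


def primary_domain(host):
    # Naive: keep the last two labels ("mail.example.no" -> "example.no").
    parts = host.rstrip(".").split(".")
    return ".".join(parts[-2:]) if len(parts) >= 2 else host


def resolve_mx(entry):
    # brreg URLs often lack a scheme, so normalise before parsing.
    url = entry["url"] if "//" in entry["url"] else "//" + entry["url"]
    host = urlparse(url).hostname
    entry["domain"] = primary_domain(host) if host else None
    try:
        answers = dns.resolver.resolve(entry["domain"], "MX")
        exchange = str(min(answers, key=lambda r: r.preference).exchange)
        entry["mx"] = exchange
        entry["mx_domain"] = primary_domain(exchange)
    except Exception:
        entry["mx"] = entry["mx_domain"] = None
    return entry


with open("webpages.txt") as fh:
    enriched = [resolve_mx(e) for e in json.load(fh)]

print(json.dumps(enriched[:5], indent=2))
```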
+ + + +[1] CyberInsecurity: The Cost of Monopoly, Geer et àl, 2003 - +https://cryptome.org/cyberinsecurity.htm + +[2] Cybersecurity as Realpolitik by Dan Geer presented at Black +Hat USA 2014: https://www.youtube.com/watch?v=nT-TGvYOBpI + +[3] https://data.brreg.no/enhetsregisteret/api/enheter/lastned + +[4] https://www.ssb.no/klass/klassifikasjoner/39 diff --git a/data/msg-eml.md b/data/msg-eml.md new file mode 100644 index 0000000..e26798c --- /dev/null +++ b/data/msg-eml.md @@ -0,0 +1,58 @@ +Thought I’d share a neat little script-combo if you do your +email analysis on Linux systems, or do automation. For the +task you’ll need msgconvert.pl [1] and ripmime [2]. + +MSG files are used by Microsoft Outlook, and is the natural +fit in regard to malicious messages in organizations running +Microsoft products. For reference you can find the +specification for the Outlook Item File Format here. + +In this part you will require a file from Outlook, which you +can acquire by selecting a message and drag it to the +desktop or a new message. If you don’t do Outlook, you can +just google for one [3]. + + msgconvert.pl .msg + ripmime -i .mime + +The above will first convert the MSG file to a mime +file. The latter command will make sure to extract the +objects in it, such as binary files or documents. The text +files contains the content of the email and will be +something like: textfile0 + +If you need the headers you will find them at the top of the +mime-file. + +Now to EML-files, which you will also often find when +exporting email messages. EML is really just short for +“E-mail”. In OS X Mail, Outlook Express, Thunderbird (and +others) you are typically presented with EML/MIME-formatted +documents, and it’s just a document which complies with RFC +822 [4]. EML-files are more easy to work on since you can +open it in a text editor and read the essential information +plain straight away. + +So what does that mean in regard to ripmime? It really just +means that instead of calling the output from msgconvert.pl +.mime, you can name the file .eml. In +commands: + + ripmime -i .eml + +The above will output your mime parts. + +## OS X Specifics + +You may want to do the above on an OS X system as well. For +this you can install ripmime via Homebrew [5]. + +If you are exporting an eml from Apple Mail you may do so +the same way as in Outlook: Just drag it where you want it. + + +[1] https://www.matijs.net/software/msgconv/ +[2] https://www.pldaniels.com/ripmime/ +[3] https://www.google.com/search?q=filetype:msg&oq=filetype:msg#q=filetype:msg+outlook +[4] https://tools.ietf.org/html/rfc822 +[5] https://brew.sh/index_nb diff --git a/data/new-format.md b/data/new-format.md new file mode 100644 index 0000000..02a9fc0 --- /dev/null +++ b/data/new-format.md @@ -0,0 +1,70 @@ +After being off the HTML grid for a while, using Hugo as a +static site generator for Gopher. I went tired of the +upgrade and complexity issues with publishing new +content. It all culminated with Hugo refusing to generate +the site at all after the last update. + +Because of the Hugo failure I needed to create a new +strategy, and not being willing to change to another complex +generator system I went hunting for something else. 
+ +I am happy with my current backend publishing setup, which +uses git and a post-receive hook: + + pwd=$(pwd) + if test -z "${pwd##*.git}" + then repo="$pwd" + else repo="$pwd/.git" + fi + + git --work-tree=~/secdiary/content --git-dir=~/secdiary/content.git checkout -f + cd ~/secdiary + rm -r /var/www/secdiary.com/* + rm -r /var/gopher/* + cp -R html/* /var/www/secdiary.com/ + cp -R gopher/* /var/gopher/ + + cp ~/twtxt/content/twtxt.txt /var/www/secdiary.com/ + + echo "\nBuild: " >> /var/gopher/index.gph + git --git-dir=~/secdiary/content.git log -1 --pretty="%H%n%ci" >> /var/gopher/index.gph + +I also publish twtxt messages in a similar way. My twtxt +config looks like the following: + + [twtxt] + nick = tommy + twtfile = ~/twtxt/twtxt.txt + twturl = http://secdiary.com + disclose_identity = False + character_limit = 140 + character_warning = 140 + post_tweet_hook = "cd ~/twtxt/ && git pull && git add twtxt.txt && git commit -m 'added new tweet' && git push" + +In addition to my twtxt feed, I am present on Mastodon, +which lead me to Solene's static site generator cl-yag +[1,2]. I decided to generate the site client-side for +now, but in the future I'll likely move this to the server +for less complex workflows on my workstations. This also +fits me well since I'll be moving more of my workflow to +OpenBSD in the coming months. + +The layout of my new site is more or less shamelessly stolen +from Solene as well. I plan to customize that to my liking as +we go. + +And with that I am back in the WWW space, however in a +limited format. I am currently reviewing my 50 current +posts and will assess what can be of use in the future. This +will involve some rewriting as well, since this space will +be text-only out of respect for your time. + +I also enabled TLS on the site for those that would like to +browse privately, opposed to my current Gopher setup. The +latter you may find on ``gopher://secdiary.com``. + +Feel free to reach out to me in the Fediverse. I'm there as +@tommy@cybsec.network. + +[1] https://dataswamp.org/\~solene/2018-10-12-cl-yag-20181012.html +[2] git://bitreich.org/cl-yag diff --git a/data/novel-pdf-detection.md b/data/novel-pdf-detection.md new file mode 100644 index 0000000..45d65fe --- /dev/null +++ b/data/novel-pdf-detection.md @@ -0,0 +1,792 @@ +For some time now the Portable Document Format standard has +been a considerable risk in regard to corporate as well as +private information security concerns. Some work has been +done to classify PDF documents as malicious or benign, but +not as much when it comes to clustering the malicious +documents by techniques used. Such clustering would provide +insight, in automated analysis, to how sophisticated an +attack is and who staged it. A 100.000 unique PDF dataset +was supplied by the Shadowserver foundation. Analysis of +experiment results showed that 97% of the documents +contained javascripts. This and other sources revealed that +most exploits are delivered through such, or similar object +types. Based on that, javascript object labeling gets a +thorough focus in the paper. + +The scope of the paper is limited to extend the attribution +research already done in regard to PDF documents, so that a +feature vector may be used in labeling a given (or a batch) +PDF to a relevant cluster. That as an attempt to recognize +different techniques and threat agents. + +> Javascript is currently one of the most exploited PDF + objects. 
How can the PDF feature vector be extended to + include a javascript subvector correctly describing the + technique/style, sophistication and similarity to previous + malicious PDF documents. How does it relate to the term + digital evidence? +> — Problem statement + +The problem statement considers the coding styles and +obfuscation techniques used and the related sophistication +in the coding style. Least but most important the statement +involves how the current PDF document measures to others +previously labeled. These are all essential problems when it +comes to automatated data mining and clustering. + +### A. Related Work + +Proposed solutions for malicious contra benign +classification of PDF documents has been explicitly +documented in several papers. Classification using support +vector machines (SVM) was handled by Jarle Kittilsen in his +recent Master's thesis1. + +Further, the author of this paper in his bachelor's thesis2 +investigated the possibility to detect obfuscated malware by +analyzing HTTP data traffic known to contain malware. In +regard, the findings were implemented, designed and tested +in Snort. Some of the detection techniques will be used as a +fundament for labeling in this paper. + +Even though much good work has been done in the era of +analyzing malicious PDF documents, many of the resulting +tools are based on manual analysis. To be mentioned are +Didier Stevens who developed several practical tools, such +as the PDF parser and PDFid. These tools are not only tools, +but was the beginning of a structured way of looking at +suspicious objects in PDF documents as well. To be credited +as well is Paul Baccas in Sophos, which did considerable +work on characterizing malicious contra benign PDF +documents3. + +The paper will be doing research into the feature, +javascript subvector of malicious PDF documents. To be able +to determine an effective vector (in this experimental +phase), it is essential that the dataset is filtered, +meaning that the files must be malicious. As Kittilsen has +done in regard to PDF documents, Al-Tharwa et ál2 has done +interesting work to detect malicious javascript in browsers. + +## Background +### A.1. The Feature Vector in Support of Digital Evidence + +Carrier and Spafford defined "digital evidence" as any +digital data that contain reliable information that supports +or refutes a hypothesis about the incident7. Formally, the +investigation process consists of five parts and is +specially crafted for maintaining evidence integrity, the +order of volatility (OOV) and the chain of custody. This all +leads up to the term forensic soudness. + +The investigation process consists of five phases. Note the +identification and analysis phase. + +![Fig. 1: The investigation process. The investigation + process consists of five phases9. Note the identification + and analysis + phase](/images/2015/02/Theinvestigationprocess-e1380485641223.png) + +In this paper, forensic soudness is a notion previously +defined10 as meaning: No alternation of source data has +occured. Traditionally this means that every bit of data is +copied and no data added. The previous paper stated two +elementary questions: + +* Can one trust the host where the data is collected from? +* Does the information correlate to other data? + +When it comes to malicious documents, they are typically +collected in two places: + +1. In the security monitoring logging, the pre-event phase +2. 
When an incident has occured and as part of the reaction to an + incident (the collection phase) + +Now, the ten thousand dollar question: When a malicious +document gets executed on the computer, how is it possible +to get indications that alteration of evidence has occured? +The answer is potentially the first collection point, the +pre-event logging. + +In many cases, especially considering targeted attacks, it +is not possible to state an PDF document as malicious in the +pre-event phase. The reason for this is often the way the +threat agent craft his attack to evade the security +mechanisms in the target using collected intelligence. Most +systems in accordance to local legislation should then +delete the content data. A proposition though is to store +the feature vector. + +The reasoning behind storing a feature vector is quite +simple: When storing hashes, object counts and the +javascript subvector which we will return to later in the +paper, it will be possible to indicate if the document +features has changed. On the other side there is no +identifiable data invading privacy. + +It is reasonable to argue that the measure of how similar +one PDF document is to another, is also the measure of how +forensically sound the evidence collected in a post-event +phase is. How likely it is that the document aquired in the +collection phase is the same as the one in the pre-phase is +decided by the characteristics supplied by the feature +vectors of both. Further, the feature-vector should be as +rich and relevant as possible. + +![Fig. 2: Correlation by using the feature vector of the PDF + document. Illustration of a possible pre/post incident + scenario](/images/2015/02/Preandpost.png) + +### A.2. Identification as an Extension of Similarity + +The notion of similarity largely relates to the feature +vector: How is it in large quantities of data possible to +tell if the new PDF document carries similar characteristics +like others of a larger dataset. + +In his work with semantic similarity and preserving hashing, +M. Pittalis11 defined similarity from the Merriam-Webster +dictionary: + +> Similarity: The existance of comparable aspect between two +> elements +> – Merriam-Webster Dictionary + +The measure of similarity is important in regard to +clustering or grouping the documents. When clustering +datasets the procedure is usually in six steps, finding the +similarity measure is step 2. + +1. Feature selection +2. Proximity/similarity measure +3. Clustering criterion +4. Clustering algorithm +5. Validation +6. Interpretation + +In this paper the k-means unsupervised learning clustering +algorithm was consideres. This simple algorithm groups the +number n observations into k clusters22. Each observation +relates to the cluster with the nearest mean. + +Now, as will be seen over the next two sections, work done +in the subject is mostly missing out on giving a valid +similarity measure when it comes to classifying PDF +documents as anything other than malicious or benign. So, to +be able to cluster the PDF documents the feature vector will +need a revision. + +As Pittalis introduced the concept of similarity, it is +important to define one more term: Identification. According +to the American Heritage Dictionary, identification is: + +> Proof or Evidence of Identity. +> — The American Heritage Dictionary + +In our context this means being able to identify a PDF +document and attribute it to e.g. a certain type of botnet +or perhaps more correct a coding or obfuscation +technique. 
In an ideal state this will give an indication to +which threat agent is behind the attack. This is something +that has not been researched extensively in regard to PDF +documents earlier. + +### C. The Portable Document Format + +When it comes to the feature vector of the portable document +format (PDF), it is reasonable to have a look at how PDF +documents are structured. The PDF consists of objects, each +object is of a certain type. As much research has been done +on the topic previously, the format itself will not be +treated any further in this paper12. + +![A simplified illustration of the portable document format](/images/2015/02/ObjectdescriptionPDF-2.png) + +When considering malicious PDF documents, relevant +statistics has shown the following distribution of resource +objects: + +**Known Malicious Datasets Objects** A table showing a +number interesting and selected features in malicious seen +against clean PDF documents. Baccas used two datasets where +one indicated slightly different results. + + Dataset Object Type Clean (%) Malicious (%) + The Shadowserver 100k PDF malicious dataset /JavaScript NA 97% + -- + Paul Baccas' Sophos 130k malicious/benign dataset3 /JavaScript 2% 94% + /RichMedia 0% 0,26% + /FlateDecode 89% 77% + /Encrypt 0,91% 10,81% + +What can be seen of the table above is that when it comes to +the distribution of objects in malicious files, most of them +contains javascript. This makes it very hard to distinguish +and find the similarity between the documents without +considering a javascript subvector. The author would argue +that this makes it a requirement for a javascript subvector +to be included in the PDF feature vector to make it a +valid. In previous work, where the aim has been to +distinguish between malicious and benign, this has not been +an issue. + +### D. Closing in on the Core: The PDF Javascript Feature Subvector + +Javascript is a client-side scripting language primarily +offering greater interactivity with webpages. Specifically +javascript is not a compiled language, weakly-typed4 and has +first-class functions5. In form of rapid development, these +features gives great advantages. In a security perspective +this is problematic. The following states a Snort signature +to detect a javascript "unescape"-obfuscation technique2(we +will return to the concept of obfuscation later on): + + alert tcp any any -> any any (msg:”Obfuscated unescape”; sid: 1337003; content:”replace”; pcre:”/u.{0,2}n.{0,2}e.{0,2}s.{0,2}c.{0,2}a.{0,2}p.{0,1}e’ ?.replace (/”;rev:4;) + +Traditionally javascript is integrated as a part of an +browser. Seen from a security perspective, this opens for +what is commonly known as client-side attacks. More +formally: Javascript enables programmatic access to +computational objects within a host environment. This is +complicated as javascript comes in different flavors, making +general parsing and evaluation complex6, as may be seen of +the above signature. The flavors are often specific to the +application. Today, most browsers are becoming more aligned +due to the requirements of interoperability. Some +applications, such as the widely deployed Adobe Reader has +some extended functionality though, which we will be +focusing on in this paper. + +Even though javascript may pose challenges to security, it +is important to realize that this is due to +complexity. Javascript (which is implemented through +SpiderMonkey in Mozilla18-products and in Adobe Reader as +well) builds on a standard named ECMA-262. 
The ECMA is an +standardization-organ of Information and Communication +Technology (ICT) and Consumer Electronics (CE)17. Thus, +Javascript is built from the ECMAScript scripting language +standard. To fully understand which functions is essential +in regard to malicious Javascripts this paper will rely on +the ECMAScript Language Specification19 combined with expert +knowledge. + +### E. Introducing Obfuscation + +Harawa et al.8 describes javascript obfuscation by six elements: + +* Identifier reassignment or randomization +* Block randomization +* White space and comment randomization +* Strings encoding +* String splitting +* Integer obfuscation + +Further, Kittilsen1 documented a javascript feature vector +which states the following functions as potentially +malicious: [function, eval_length, max_string, stringcount, +replace, substring, eval, fromCharCode]. Even though his +confusion matrix shows good results, there are some problems +when it comes to evaluating these as is: Such characters are +usually obfuscated. The following is an example from sample +``SHA256:d3874cf113fa6b43e7f6e2c438bd500edea5cae7901e2bf921b9d0d2bf081201]``: + + if((String+'').substr(1,4)==='unct'){e="".indexOf;}c='var _l1="4c206f5783eb9d;pnwAy()utio{.VsSg',h<+I}*/DkR%x-W[]mCj^?:LBKQYEUqFM';l='l';e=e()[((2+3)?'e'+'v':"")+"a"+l];s=[];a='pus'+'h';z=c's'+"ubstr" [1];sa [2];z=c's'+"ubstr" [3];sa [2];z=c['s'+"ubstr"] [...]e(s.join(""));} + +The above example tells an interesting story about the +attackers awareness of complexity. In respect to Kittilsens +javascript feature vector the above would yield the +following result: [0,x,x,x,0,0,0,0] (considerable results on +the second to fourth, plus one count if we are to shorten +substring to substr), in other words the features are to be +found in the embedded, obfuscated javascript, but not in +clear text. When it comes to eval_length, max_string and +string_count we will return to those later in the paper. 
+ +Deobfuscated, the script would look like: + + var _l1="[...]";_l3=app;_l4=new Array();function _l5(){var _l6=_l3.viewerVersion.toString();_l6=_l6.replace('.','');while(_l6.length&4)_l6l='0';return parsetnt(_l6,10);function _l7(_l8,_l9){while(_l8.length+2&_l9)_l8l=_l8;return _l8.substring(0,_l9I2);function _t0(_t1){_t1=unescape(_t1);rote}a*=_t1.length+2;da*/ote=unescape('Du9090');spray=_l7(da*/ote,0k2000Rrote}a*);lok%hee=_t1lspray;lok%hee=_l7(lok%hee,524098);for(i=0; i & 400; ill)_l4xi-=lok%hee.substr(0,lok%hee.lengthR1)lda*/ote;;function _t2(_t1,len){while(_t1.length&len)_t1l=_t1;return _t1.substring(0,len);function _t3(_t1){ret='';for(i=0;i&_t1.length;il=2){b=_t1.substr(i,2);c=parsetnt(b,16);retl=String.froW[har[ode(c);;return ret;function _]i1(_t1,_t4){_t5='';for(_t6=0;_t6&_t1.length;_t6ll){_l9=_t4.length;_t7=_t1.char[odeAt(_t6);_t8=_t4.char[odeAt(_t6D_l9);_t5l=String.froW[har[ode(_t7m_t8);;return _t5;function _t9(_t6){_]0=_t6.toString(16);_]1=_]0.length;_t5=(_]1D2)C'0'l_]0j_]0;return _t5;function _]2(_t1){_t5='';for(_t6=0;_t6&_t1.length;_t6l=2){_t5l='Du';_t5l=_t9(_t1.char[odeAt(_t6l1));_t5l=_t9(_t1.char[odeAt(_t6));return _t5;function _]3(){_]4=_l5();if(_]4&9000){_]5='oluAS]ggg*pu^4?:IIIIIwAAAA?AAAAAAAAAAAALAAAAAAAAfhaASiAgBA98Kt?:';_]6=_l1;_]7=_t3(_]6);else{_]5='*?lAS]iLhKp9fo?:IIIIIwAAAA?AAAAAAAAAAAALAAAAAAAABk[ASiAgBAIfK4?:';_]6=_l2;_]7=_t3(_]6);_]8='SQ*YA}ggAA??';_]9=_t2('LQE?',10984);_ll0='LLcAAAK}AAKAAAAwtAAAALK}AAKAAAA?AAAAAwK}AAKAAAA?AAAA?gK}AAKAAAA?AAAAKLKKAAKAAAAtAAAAEwKKAAKAAAAwtAAAQAK}AUwAAA[StAAAAAAAAAAU}A]IIIII';_ll1=_]8l_]9l_ll0l_]5;_ll2=_]i1(_]7,'');if(_ll2.lengthD2)_ll2l=unescape('D00');_ll3=_]2(_ll2);with({*j_ll3;)_t0(*);Ywe123.rawValue=_ll1;_]3(); + +Which through the simple Python script javascript feature +vector generator (appendice 1), yields: + + ['function: 9', 'eval_length: x', 'max_string: x', 'stringcount: x', 'replace: 1', 'substring|substr: 4', 'eval: 0', 'fromCharCode: 0'] + +Harawa et al.' 6 elements of javascript obfuscation is +probably a better, or necessary supplemental approach to +Kittilsens work. + +There is a notable difference between deobfuscation and +detecting obfuscation techniques. The difference consists of +the depth of insight one might gain in actually +deobfuscating a javascript as it will reveal completely +different code while the obfuscation routines may be based +on a generic obfuscator routine used by several threat +agents. This is much like the issue of packers in regard to +executables23. + +This section has shown the difficulties of balancing +deobfuscation for a more detailed coding style analysis +against a less specific feature vector by using abstract +obfuscation detection. + +## Extracting and Analysing a PDF Feature Vector + +### A. Deobfuscation - Emerging Intentions + +Usually the most pressing question when an incident +involving a PDF document occur is: Who did it, and what's +his intentions. This is also a consideration when further +evolving the PDF feature vector. In the next figure is a +model describing three groups of threat agents, where one +usually stands out. Such as if a Stuxnet scale attack24 +involving a PDF document is perceived it will be associated +with a cluster containing "group 1" entities. + +While Al-Tharwa et ál2 argues for no need for deobfuscation +in regard to classification, deobfuscation is an important +step in regard to finding a distinct feature vector. 
The +issue is that in most situations it isn't good enough to +tell if the documents is malicious, but also in addition to +who, what, where and how it was created. In regard to being +defined as valid digital evidence a rich feature vector (in +addition to the network on-the-fly hash-sum) is part of +telling. The latter also makes itself relevant when it comes +to large quantities of data, where an analyst is not capable +of manually analyzing and identifying hundreds to tens of +thousands of PDF documents each day. + +![Fig. 4: The threat agent modelA model describing three + groups of attackers. These are necessary to filter and + detect in the collection + phase](/images/2015/02/threat-agent-model.png) + +### B. Technical Problems During Deobfuscation + +Normally most javascript engines, such as Mozillas +Spidermonkey15, Google V816 and others, tend to be +javascript libraries for browsers and miss some basic +functionality in regard to Adobe Reader which is the most +used PDF reader. These engines is most often used for +dynamic analysis of Javascripts and is a prerequiste when it +comes to being able to completely deobfuscate javascripts. + +To prove the concepts of this article a static Python +feature vector generator engine based on a rewritten version +of the Jsunpack-n14project is used. The application used in +the paper is providing a vector based interpretation of the +static script, meaningn it is not run it dynamically. + +Reliably detecting malicious PDF documents is a challenge +due to the obfuscation routines often used. This makes it +necessary to perform some kind of deobfuscation to reveal +more functionality. Even if one managed to deobfuscate the +script one time, there may be several rounds more before it +is in clear text. This was a challenge not solvable in the +scope of this article. + +Due to parsing errors under half of the Shadowserver 100k +dataset was processed by the custom Jsunpack-n module. + +### C. Introducing Two Techniques: Feature Vector Inversion and Outer Loop Obfuscation Variable Computation + +As have been very well documented so far in the paper it is +more or less impossible to completely automate an +deobfuscation process of the PDF format. Obfuscation leaves +many distinct characteristics though, so the threat agent on +the other hand must be careful to not trigger anomaly +alarms. There is a balance. This part of the article +introduces two novel techniques proposed applied to the +javascript subvector to improvie its reliability. + +#### C.1. Outer Loop Obfuscation Variable Computation (OLOVC) + +When the threat agent implements obfuscation, one of his +weaknesses is being detected using obfuscation. When it +comes to PDF documents using javascripts alone is a +trigger. Now, the threat agent is probably using every trick +in the book, meaning the 6 elements of javascripts +obfuscation8. The job of an analyst in such a matter will be +to predict new obfuscation attempts and implement anomaly +alerts using the extended PDF feature vector. + +Throughout this paper we will name this technique "Outer +Loop Obfuscation Variable Computation". The term "outer +loop" most often refer to round zero or the first of the +deobfuscation routines. Variable computation is as its name +states, a matter of computing the original javascript +variable. As we have seen this may be done by either +deobfuscating the script as a whole including its +near-impossible-for-automation complexity, or use the +original obfuscated data. 
We will have a further look at the +latter option. + +Take for instance this excerpt from the "Introducing Obfuscation"-section: + + z=c['s'+"ubstr"](0,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);z=c['s'+"ubstr"](2,1);s[a](z);z=c['s'+"ubstr"](3,1);s[a](z);z=c['s'+"ubstr"](4,1);s[a](z);z=c['s'+"ubstr"](5,1);s[a](z);z=c['s'+"ubstr"](6,1);s[a](z);z=c['s'+"ubstr"](7,1);s[a](z);z=c['s'+"ubstr"](8,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](10,1);s[a](z);z=c['s'+"ubstr"](11,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](13,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](14,1);s[a](z);z=c['s'+"ubstr"](12,1);[...](20,1);s[a](z);z=c['s'+"ubstr"](17,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);z=c['s'+"ubstr"](18,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](11,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](17,1);s[a](z);z=c['s'+"ubstr"](11,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);z=c['s'+"ubstr"](13,1);s[a](z);z=c['s'+"ubstr"](19,1);s[a](z);z=c['s'+"ubstr"](11,1);s[a](z);z=c['s'+"ubstr"](14,1);s[a](z);z=c['s'+"ubstr"](17,1);s[a](z);z=c['s'+"ubstr"](12,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](6,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](6,1);s[a](z);z=c['s'+"ubstr"](9,1);s[a](z);z=c['s'+"ubstr"](6,1);s[a](z); + + +Harawa ét al defined the above obfuscation technique as +"string splitting" (as seen in the section "Introducing +obfuscation"). The following two obfuscation-extraction +regular expressions, is previously stated in the authors +Bachelors thesis2: + + e.{0,2}v.{0,2}a.{0,2}l.{0,1} + + u.{0,2}n.{0,2}e.{0,2}s.{0,2}c.{0,2}a.{0,2}p.{0,1}e + +Keep the two above statements and the previous code excerpt +in mind. When breaking down the above expressions we +introduce one more regular expression: + + s.{0,4}u.{0,4}b.{0,4}s.{0,4}t.{0,4}r.{0,4} + +While searching for "substr" in plain text in the plain-text +will certainly fail, the above expression will match e.g.: + + 's'+"ubstr" + +Recall Kittilsens javascript feature vector: ``[function, +eval_length, max_string, stringcount, replace, substring, +eval, fromCharCode]``. If extended by the above techniques, +the results is somewhat different. 
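Before looking at those numbers, a small Python illustration shows
why the relaxed expression matches where a literal search fails.
This is a sketch for demonstration purposes only, not the
Jsunpack-n based generator used in the experiments:

    import re

    # String-split obfuscation as in the excerpt above.
    sample = '''z=c['s'+"ubstr"](0,1);s[a](z);z=c['s'+"ubstr"](1,1);s[a](z);'''

    # A literal search misses the split-up function name entirely.
    print(sample.count("substr"))             # 0

    # The relaxed expression tolerates a few arbitrary characters
    # between the letters and still matches both occurrences.
    olovc_substr = re.compile(r's.{0,4}u.{0,4}b.{0,4}s.{0,4}t.{0,4}r.{0,4}')
    print(len(olovc_substr.findall(sample)))  # 2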
+ +Without string splitting detection: + + ['function: 9', 'eval_length: x', 'max_string: 10849', 'stringcount: 1', 'replace: 1', 'substring|substr: 4', 'eval: 0', 'fromCharCode: 0'] + +With outer loop obfuscation variable computation: + + ['function: 0', 'eval_length: x', 'max_string: 67', 'stringcount: 2', 'replace: 0', 'substring: 0', 'substr: 3663', 'eval: 1', 'fromCharCode: 0'] + +Additionally, rewriting and extending Kittilsens feature +vector by several other typically suspicious functions +should give preferrable results: ``[max_string, stringcount, +function, replace, substring, substr, eval, fromCharCode, +indexof, push, unescape, split, join, sort, length, +concat]`` + +This makes the following results in two random, but related, samples: + + [SHA256:5a61a0d5b0edecfb58952572addc06f2de60fcb99a21988394926ced4bbc8d1b]:{'function': 0, 'sort': 0, 'unescape': 0, 'indexof': 0, 'max_string': 10849, 'stringcount': 2, 'replace': 0, 'substring': 0, 'substr': 1, 'length': 1, 'split': 2, 'eval': 0, 'push': 0, 'join': 1, 'concat': 0, 'fromCharCode': 0} + + [SHA256:d3874cf113fa6b43e7f6e2c438bd500edea5cae7901e2bf921b9d0d2bf081201]:{'function': 0, 'sort': 0, 'unescape': 0, 'indexof': 0, 'max_string': 67, 'stringcount': 1, 'replace': 0, 'substring': 0, 'substr': 3663, 'length': 0, 'split': 0, 'eval': 0, 'push': 1, 'join': 1, 'concat': 0, 'fromCharCode': 0} + +It may perhaps not need a comment, but in the above results +we see that there are two types of elements in the feature +vector that stands out: max_string and two of the suspicious +functions. + +Summarized the "Outer Loop Obfuscation Variable Computation" +may be used to, at least partially, defeat the malware +authors obfuscation attempts. By running the somewhat +complex regular expressions with known malicious obfuscation +routines, the implementation result of the 100.000 PDF +dataset may be seen in the following table: Dataset +generalization by "outer loop obfuscation variable +computation" Dataset aggregated by counting javascript +variables and functions, OLOVC applied (due to errors in the +jsunpack-n the total number of entities calculated is +42736). + + Word Count + function 651 + sort 7579 + unescape 4 + toLowerCase 1 + indexof 8 + max_string 42346 + stringcount 41979 + replace 70 + substring 91 + replace 70 + substring 91 + substr 38952 + length 1512 + split 9621 + eval 77 + push 260 + join 91 + inverse_vector 41423 + concat 86 + fromCharCode 45 + +By the counts in the above table it is shown that the +selected feature vector has several very interesting +features. On a sidenote: Even though some features has a +larger quantity than others it should be mentioned that this +is not necessarily the measure of how good that feature is, +such is especially the case with the inverse vector as we +will be more familiar with in the next section. Also, as +previously mentioned it is interesting to see the +composition of multiple features to determine the origin of +the script (or the script style if you'd like). The +aggregation script is attached in appendice 2. + +The "Outer Loop Obfuscation Variable Computation" will +require a notable amount of computational resources in +high-quantity networks due to the high workload. In a way +this is unavoidable since the threat agents objective of +running client-side scripts is to stress the resources of +such systems. + +![Fig. 5: Illustration of Computational Complexity. 
The illustration shows the computational load on a network sensor in regard to different obfuscation techniques](/images/2015/02/Skjermbilde-2012-05-08-kl--20-43-04.png) + +### C.2. Feature Vector Inversion + +Threat agents go a long way in evading detection +algorithms. The following thought is derived from a common +misconception in database security: + +> A group of ten persons which names are not to be revealed + is listed amongst a couple of thousands, in an + organizations LDAP directory. The group, let us name it X, + is not to be revealed and is therefore not named in the + department field. + +While the public may not search and filter directly on the +department name, being X, an indirect search would be +succesful to reveal the group due to the ten persons being +the only ones not associated with a department. + +The concept of searching indirectly may be applied to +evaluating javascripts in PDF documents as well. We might +start off with some of the expected characters found in +benign javascript documents: + + {'viewerVersion':1,'getPrintParams':1,'printd':1,'var':10,'getPageNthWord':1,'annot':2,'numPages':1,'new':3} + +The above which is found by expert knowledge as the probable +used variables and functions in a benign javascript or other +object. Much of these functions is used in interactive PDF +documents, e.g. providing print buttons, + +A weight is added to each cleartext function/variable. After +counting the words in the document a summarized variable +named the inverted_feature_vector gives an integer. The +higher the integer, the higher the probability of the +javascript being benign. + +The inversed feature vector may be used as a signature and a +whitelist indication database may be built of datasets. In +the 100k malicious dataset the statistics showed that out of +42475, 41423 had more than one occurence of a known benign +variable. This might seem like a less good feature, but the +quantity is not the issue here, it is the weight of each +variable. So: One may say that the higher the inverse vector +is, the more likely it is that the PDF or javascript is +benign. To clarify, next table shows variables fragmented by +weight: Inverse vector separated by interval, the + +**Shadowserver 100k dataset** _The table shows that most +malicious PDF files in the 100k Shadowserver dataset +contains low-weighted scores when it comes to the inverted +vector as a measure of how benign the scripts are._ + + Weight interval Instances Instance percentage + <10 15232 35,6% + 20<>9 26852 62,8% + 30<>19 136 ~0% + 40<>29 148 ~0% + 50<>39 87 ~0% + 60<>49 28 ~0% + >60 253 ~0% + Total 42736 - + +The inversion vector may as well be seen as a measure of the +likeliness that the script is obfuscated. A quick look at +the table shows that the characteristics of obfuscation is +found in most PDF documents in the Shadowserver 100k +dataset. + +Even though this part of the vector should be seen as an +indication, analysts should be aware that threat agents may +adapt to the detection technique and insert clear text +variables such as the ones listed above in addition to their +malicious javascripts. This latter would function as a +primitive feature vector inversion jammer. In other words it +should be seen in context with the other items of the +javascript feature vector as well. Further, the concept +should be further evolved to avoid such evasion. 
One +technique to segment the code before analyzing it (giving +each code segment a score, finally generating a overall +probability score), making it more difficult for the threat +agent to utilize noise in his obfuscation. + +### D. Clustering + +Experience shows that in practically oriented environments +security analysis is, at least partially, done in a manual +manner. This saying that the detection is based on +indicators or anomalies and the analysis of the detection +results is performed manually by an analyst. Though this may +possibly be the approach resulting in least false positives +it is overwhelming in regard to analysis of all potentially +PDF documents in a larger organization. The 100k PDF dataset +used in this paper is a evidence of such. So, how is it +possible to automatically detect the interesting parts of +the 100k PDF dataset? This question leads to the concept of +data mining. + +The definition of data mining is the transformation of data +to "meaningful patterns and rules". + +Michael Abernethy at IBM developerWorks20 covers data mining quite extensively. + +#### D.1. A Narrow Experiment and Results + +In this paper the goal is to achieve an view of the dataset +in a way that is named "undirected" data mining: Trying to +find patterns or rules in existing data. This is achieved +through the feature vector previously presented. + +Up until now this paper has discussed how to generate an +satisfactionary feature vector and what makes the measure of +similarity. Let us do an experiment using WEKA (Waikato +Environment for Knowledge Analysis) for analyzing our +feature vector. + +Appendice 3 describes the ARFF format found from our feature +vector and two of the previously presented feature vectors +(SHA256: +``5a61a0d5b0edecfb58952572addc06f2de60fcb99a21988394926ced4bbc8d1b``, +``d3874cf113fa6b43e7f6e2c438bd500edea5cae7901e2bf921b9d0d2bf081201``) +and a random selection of 2587 parseable PDF-documents from +the dataset. + +In this experiement the feature vector were produced of 200 +random samples from the 100k dataset. Interesting in that +regard is that the subdataset loaded from originally +contained 6214 samples, while our application only handled +the decoding of under half. The feature vector was extracted +in a CSV format, converted by the following WEKA Java class +and loaded in WEKA: + + java -classpath /Applications/weka-3-6-6.app/Contents/Resources/Java/weka.jar weka.core.converters.CSVLoader dataset.csv + +In the WEKA preprocessing, the results may be visualized: + +![Fig. 6: Results 1; PDF Feature Vector DistributionA model + showing the PDF feature vector object distribution using + the 2587 parsable PDF + documents](/images/2015/02/Skjermbilde-2012-05-16-kl--13-17-20.png) + +### D.2. The complete dataset + +Next loading the complete feature vector dataset consisting +of 42736 entities showed interesting results when +clustering. + +![Fig. 7: Stringcount vs anomalies in the inverse + vector. Stringcount vs anomalies in the + inverse_vector. Using k-means algorithm and k=5. Medium + Jitter to emphasize the + clusters](/images/2015/02/Skjermbilde-2012-06-27-kl--11-40-19.png) + +The cluster process above also enables the possibility to +look at the anomalies where the inverse_vector is high. For +instance 9724 (the highest one in the Y-axis) the +inverse_vector is 21510 which is a very clear anomaly +compared to the rest of the clusters (the distance is +far). This should encourage a closer look at the file based +on the hash. 
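Outside WEKA, the same clustering and distance-based anomaly
ranking can be sketched in a few lines with scikit-learn. The
snippet below is only an illustration and not part of the original
toolchain: it assumes a CSV export of the feature vectors, one row
per document, with a hypothetical sha256 identifier column kept
for traceability:

    import pandas as pd
    from sklearn.cluster import KMeans

    # Assumed layout: a "sha256" column plus the numeric feature columns
    # (stringcount, substr, inverse_vector and so on).
    df = pd.read_csv("dataset.csv")
    features = df.drop(columns=["sha256"])

    km = KMeans(n_clusters=5, n_init=10, random_state=0).fit(features)
    df["cluster"] = km.labels_

    # Distance to the nearest cluster centre serves as a simple anomaly
    # score; documents far from every centre deserve a manual look.
    df["distance"] = km.transform(features).min(axis=1)
    print(df.sort_values("distance", ascending=False)
            [["sha256", "cluster", "distance"]].head(10))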
+ +The Shadowserver 100k ARFF dataset will be further evolved and may be found at the project GitHub page25. + +### E. Logging and Interpreting Errors + +Again and again while analyzing the 100k dataset the +interpreter went on parsing errors. Bad code one may say, +but a fact is that the threat agents are adapting their code +to evading known tools and frameworks. An example of this is +a recent bug21 in Stevens PDF parser where empty PDF objects +in fact created an exception in the application. + +So, what does this have to do with this paper? Creative +threat agents can never be avoided, creating malicious code +that avoids the detection routines. This makes an important +point, being that the application implemented should be +using strict deobfuscation and interpretation routines. When +an error occurs, which will happen sooner or later, the file +should be traceable and manually analyzed. This in turn +should lead to an adaption of the application. Where the +routines fails will also be a characteristic of the threat +agent: What part of the detection routines does he try to +evade? E.g. in the 100k dataset an error on the +ascii85-filter occurred. The parsing error made the +parser-module not to output a feature vector, and were +detected by error monitoring in log files. + +## Discussion and Conclusions + +In regard to being used standalone as evidence the feature +vector will have its limitations, especially since its hard +to connect it to an event it should be considered +circumstancial. + +The PDF and ECMA standard are complex and difficult to +interpret, especially when it comes to automation. As has +been shown in this article a really hard problem is +dynamically and generically executing javascripts for +deobfuscation. This is also shown just in the Adobe Reader, +where e.g. Adobe Reader X uses Spidermonkey 1.8, while +previous more prevalent versions use version 1.7 of +Spidermonkey. This often resulted in parsing errors, and +again it will potentially cause a larger error rate in the +next generation intrusion detection systems. + +It has been proved that a static analysis through a +Jsunpack-n modification recovers good enough round-zero +data, from a little less than half of the Shadowserver 100k +dataset, to generate a characteristic of each file. The +results were somewhat disappointing in regard to the +extensive parsing errors. Parsing optimalization and error +correction making the script more robust and reliable should +be covered in a separate report. Despite the latter a good +foundation and enough data were given to give a clue for +what to expect from the extended PDF feature vector. Also, +the inverse vector with its weighting gives a individual +score to each document, making it exceptionally promising +for further research. + +In regard to OLOVC a certain enhancement would be to combine +it with the work of Franke' and Petrovic' "Improving the +efficiency of digital forensic search by means of contrained +edit distance". Their concept seems quite promising and +might provide valuable input to OLOVC. + +The dataset used in this article may contain certain flaws +in its scientific foundation. No dataset flaws, but +indications that some data origins from the same source, has +been seen throughout this article. The reason is most +probably that the dataset was collected over three +continuous days. Linked to the behaviour of malware it is +known that certain malware such as drive-by attacks has +peaks in its spread as a function of time. 
It is therefore natural to assume that there are larger
occurrences of PDF documents originating from the same threat
agent. On the other hand, in further research this can serve as a
measure of how effectively the algorithms group the data.

The Shadowserver 100k dataset only contains distinct files. It
would be interesting to recollect a similar dataset with
non-distinct hash-entries, and to cluster it by fuzzy hashing as
well.

Even though clustering is mentioned in the last part of this
article, further extensive research should be done to completely
explore the potential of the current feature vector. In other
words, the scope of the article only permitted a manual selection
of the feature vector and a more or less fixed measure of
similarity through the extended PDF feature vector.

The project has a maintained GitHub page as introduced in the last
section. This page should encourage further development of the
extended PDF feature vector.

If you'd like, please have a look at the GuC Testimon Forensic
Laboratory [1].


[1] GuC Testimon Forensic Laboratory: https://sites.google.com/site/testimonlab/

diff --git a/data/osquery.md b/data/osquery.md
new file mode 100644
index 0000000..bc53503
--- /dev/null
+++ b/data/osquery.md
@@ -0,0 +1,211 @@
In another post I wrote about how telemetry is a challenge [1] in
a changing, more diverse and modern landscape. Recently I have
reviewed some device inventory and endpoint detection tools that
will add to the solution. In the future I will get back to my view
on Mozilla InvestiGator (MIG) [2], but this post will focus on a
telemetry collection tool that I have grown fond of: osquery [3].

osquery was originally developed by Facebook for the purpose of
[4]:

> Maintaining real-time insight into the current state of your infrastructure[...]

With osquery, data in the operating system in which the agent runs
is abstracted to a SQL-based interface. It exposes a near-infinite
amount of data, which is perfect for a network defender. osquery
can even parse native sqlite databases, of which there are plenty
in macOS. Like GRR and MIG it also works in a distributed mode,
meaning queries are distributed to the endpoints. Events can be
streamed as well, which is worth considering for operational
security.

![Example of the hardware_events table when plugging in and then detaching a Yubikey](/static/img/data/osquery_hardware_events.png)

osquery was open sourced in 2014 and now has a large community
developing just about every aspect of the tool. According to the
briefings available online, several major institutions, including
Facebook, now use osquery in their service networks.

osquery is cross-platform and now supports Linux, FreeBSD, Windows
and macOS. That is also part of what separates it from
alternatives such as Sysmon.

Posts about osquery that you should review before moving on:

* Doug Wilson's excellent presentation at FIRST 2018
  (security-usage focused) [5]
* Managing osquery with Kolide (an osquery TLS server) [6]
* Another post on applying osquery for security [7]
* Palantir on osquery [8]

So that was a couple of links to get you started. The next section
shows you how to quickly get a lab environment up and running.
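Before that, to get a feel for the SQL interface itself, the
osquery Python bindings (the separate ``osquery`` package on PyPI,
not the agent) can spawn a throwaway local instance and run ad-hoc
queries. A minimal sketch, assuming both the bindings and the
osquery agent are installed locally:

    import osquery  # pip install osquery

    # Spawn a standalone osquery instance over an ephemeral extension socket.
    instance = osquery.SpawnInstance()
    instance.open()

    # Everything is a table: listening ports joined with process names.
    result = instance.client.query(
        "SELECT p.name, l.port, l.address "
        "FROM listening_ports l JOIN processes p ON l.pid = p.pid;")

    for row in result.response:
        print(row)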
## Setup and Configuration

### Prerequisites

There are only two things you need set up for the rest of this
article if you are on macOS, both of which can be easily installed
using Homebrew [9]:

    brew install go yarn

You also need to configure your Go path, which can be as simple
as:

    echo "export GOPATH=$HOME/go" >> ~/.bash_profile

### Server Setup

Set up the Docker image of Kolide Fleet [10]:

    mkdir -p $GOPATH/src/github.com/kolide
    cd $GOPATH/src/github.com/kolide
    git clone git@github.com:kolide/fleet.git
    cd fleet
    make deps && make generate && make
    docker-compose up

Populate the database:

    ./build/fleet prepare db

You are now ready to boot up the web UI and API server:

    ./build/fleet serve --auth_jwt_key=3zqHl2cPa0tMmaCa9vPSEq6dcwN7oLbP

Get the enrollment secret and certificate from the Kolide UI at
``https://localhost:8080`` after completing the registration
process.

![Kolide enrollment](/static/img/data/kolide-enrollment.png)

### Client Setup

Make the API token (enrollment secret) persistent on the endpoint:

    echo "{enrollment-secret}" > /etc/osquery/enrollment.secret

Define a flags file in ``/private/var/osquery/osquery.flags``. The
client uses this to apply the centralised TLS logging method,
which is the API Kolide implements. The connection is also
certificate pinned.

    --enroll_secret_path=/etc/osquery/enrollment.secret
    --tls_server_certs=/etc/osquery/kolide.crt
    --tls_hostname=localhost:8080
    --host_identifier=uuid
    --enroll_tls_endpoint=/api/v1/osquery/enroll
    --config_plugin=tls
    --config_tls_endpoint=/api/v1/osquery/config
    --config_tls_refresh=10
    --disable_distributed=false
    --distributed_plugin=tls
    --distributed_interval=10
    --distributed_tls_max_attempts=3
    --distributed_tls_read_endpoint=/api/v1/osquery/distributed/read
    --distributed_tls_write_endpoint=/api/v1/osquery/distributed/write
    --logger_plugin=tls
    --logger_tls_endpoint=/api/v1/osquery/log
    --logger_tls_period=10

You can start the osquery daemon on the client with the following
command. At this point you should start thinking about packaging,
which is detailed in the osquery docs [11].

    /usr/local/bin/osqueryd --disable_events=false --flagfile=/private/var/osquery/osquery.flags

osquery also has an interactive mode if you would like to test the
local instance, based on a local configuration file:

    sudo osqueryi --disable_events=false --config_path=/etc/osquery/osquery.conf

To make the client persistent on macOS, use the following
documentation from osquery [12].

### Managing the Kolide Configuration

For this part I found that the Kolide CLI client worked best [13]:

    ./build/fleetctl config set --address https://localhost:8080
    ./build/fleetctl login
    ./build/fleetctl apply -f ./options.yaml

The ``options.yaml`` I used for testing is shown below. This setup
also enables osquery File Integrity Monitoring (FIM) [14], which I
wasn't able to get working through the curl patching approach [15]
in the docs. The config monitors file changes under ``/etc`` and
in a test directory at ``/var/tmp/filetest``.
+ + apiVersion: v1 + kind: options + spec: + config: + decorators: + load: + - SELECT uuid AS host_uuid FROM system_info; + - SELECT hostname AS hostname FROM system_info; + file_paths: + etc: + - /etc/%% + test: + - /var/tmp/filetest/%% + options: + disable_distributed: false + distributed_interval: 10 + distributed_plugin: tls + distributed_tls_max_attempts: 3 + distributed_tls_read_endpoint: /api/v1/osquery/distributed/read + distributed_tls_write_endpoint: /api/v1/osquery/distributed/write + logger_plugin: tls + logger_tls_endpoint: /api/v1/osquery/log + logger_tls_period: 10 + pack_delimiter: / + overrides: {} + +## Next Steps + +Through this article we've reviewed some of the basic capabilities +of osquery and also had a compact view on a lab-setup +demonstrating centralised logging, to Kolide, using the tls API of +osquery. + +A couple of things that I would have liked to see was support for +OpenBSD [16], Android and Ios [17]. + +The local setup obviously does not scale beyond your own +computer. I briefly toyed with the idea that this would be a +perfect fit for ingesting into a Hadoop environment, and not +surprising there's a nice starting point over at the Hortonworks +forums [18]. + +There's a lot of open source information on osquery. I also found +the Uptycs blog useful [19]. + +[1] https://secdiary.com/2018-02-25-telemetry.html +[2] https://mig.mozilla.org +[3] https://osquery.io +[4] https://code.fb.com/security/introducing-osquery/ +[5] +https://www.first.org/resources/papers/conf2018/Wilson-Doug_FIRST_20180629.pdf +[6] +https://blog.kolide.com/managing-osquery-with-kolide-launcher-and-fleet-b33b4536acb4 +[7] https://medium.com/@clong/osquery-for-security-part-2-2e03de4d3721 +[8] https://github.com/palantir/osquery-configuration +[9] https://brew.sh +[10] +https://blog.kolide.com/managing-osquery-with-kolide-launcher-and-fleet-b33b4536acb4 +[11] https://osquery.readthedocs.io/en/2.1.1/installation/custom-packages/ +[12] https://osquery.readthedocs.io/en/stable/installation/install-osx/ +[13] +https://github.com/kolide/fleet/blob/master/docs/cli/setup-guide.md +[14] +https://osquery.readthedocs.io/en/stable/deployment/file-integrity-monitoring/ +[15] +https://github.com/kolide/fleet/tree/master/docs/api#file-integrity-monitoring +[16] https://github.com/facebook/osquery/issues/4703 +[17] https://github.com/facebook/osquery/issues/2815 +[18] +https://community.hortonworks.com/articles/79842/ingesting-osquery-into-apache-phoenix-using-apache.html +[19] https://www.uptycs.com/blog diff --git a/data/privacy-report-2014.md b/data/privacy-report-2014.md new file mode 100644 index 0000000..95e783c --- /dev/null +++ b/data/privacy-report-2014.md @@ -0,0 +1,69 @@ +I read in a Norwegian news publication yesterday that [more +than 50% of Norwegians doesn't care about Internet and +network surveillance [1]. In the original 60 page report +(survey and report ordered by the Norwegian Data Protection +Authority), named Privacy 2014 - The Current State and +Trends ("Personvern 2014 - Tilstand og Trender"), 46% of the +1501 participants state that they've gotten more concerned +with privacy over the last 2-3 years. + +The follow up question that the survey presented was "How +much do you care about privacy?". In the 1997 version of the +survey 77% said they were "pretty engaged or very engaged" +in privacy, while in 2013 there's an increase to 87%. Not as +bad as the news publication wants it to be in other words. 
I +guess what is referred to is mentioned in the section "The +Chilling Effects in Norway", where more than half of the +respondents states they haven't changed online behaviour +after the revelations of the American surveillance +methodologies. I think this correlates to the next section +(below). Also, more than 45% state that they would have +continued as normal if Norway were to start a massive +surveillance campaign in collaboration with foreign +intelligence. + +I read one section where asked "how much control of your own +situation do you feel you have?". More than half of the +respondents answered themselves, and 33% the government. The +latter is pretty amazing in my opinion. It's obviously +yourself that is responsible for your own situation. Seen in +regard to that more than 78% wouldn't pay 20 bucks a month +for privacy in online services it's even better. + +The report also have it's own section dedicated to the +Snowden revelations. Pretty interesting that 53% responded +that they didn't care about the surveillance, it is +unproblematic or that it's just plain +necessary. Interesting, considering that it's another nation +state than Norway we're talking about here. I could have +understood it if it was our own government, but another +country? Anyways, that's the facts. + +One question that I perhaps miss in the survey is "have you +done anything to protect your online presence from +surveillance?". One of the alternatives could for instance +be: "I use end-to-end encryption, such as GPG". It was +obviously not that technical a survey, and I can respect +that - but at the same time I see that's where it have to +end at some point. Thinking if I was employed in another +type of occupation: I think people would have continued as +normal if we get a mass-surveillance state because you get +to a point of exhaustion due to the complexity of the +technology and lack of knowledge on how to actually protect +yourself. I also think that the hypothetical question of +awareness of a mass-surveillance state would have had more +chilling effects than people actually respond. The question +actually reminds me of the Iron Curtain period, thinking +that you are always surveilled. + +The survey can be read in full here [2] (Norwegian), and I +think it's pretty good and thorough on the current state of +privacy in Norway. The survey was delivered by Opinion +Perduco. The 1997 survey was delivered by Statistics Norway. + + +[1] http://translate.google.com/translate?sl=auto&tl=en&js=n&prev=_t&hl=en&ie=UTF-8&u=http%3A%2F%2Fwww.digi.no%2F926712%2Fhalvparten-gir-blaffen +[2] https://www.datatilsynet.no/Nyheter/2014/Personvern-2014-tilstand-og-trender-/ + + + diff --git a/data/relayd-multidomain.md b/data/relayd-multidomain.md new file mode 100644 index 0000000..0cfcb02 --- /dev/null +++ b/data/relayd-multidomain.md @@ -0,0 +1,134 @@ +While running a relayd service for a multi-domain instance +recently I quickly came into an issue with relayd routing. + +relayd(8) is the relay daemon in OpenBSD. 
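+If you want to follow along, the standard OpenBSD commands for
+validating, enabling and inspecting a relayd configuration are
+shown below (a sketch assuming the default /etc/relayd.conf and
+doas for privilege escalation):
+
+    # validate the configuration without starting the daemon
+    $ doas relayd -n -f /etc/relayd.conf
+    # enable and (re)start the service
+    $ doas rcctl enable relayd
+    $ doas rcctl restart relayd
+    # inspect relays, tables and sessions at runtime
+    $ doas relayctl show summary
+    $ doas relayctl show sessions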
+ +I run two local services that I front with relayd: + +* service A +* service B + +These two I define in relayd.conf(5): + + ext_addr="" + honk_port="31337" + inks_port="31338" + table { 127.0.0.1 } + table { 127.0.0.1 } + +To make sure relayd logs sufficiently for traceability I apply the +following options: + + log state changes + log connection + +The next part of my relayd.conf is creating a configuration for +the relay service ("protocols are templates defining settings and rules for relays"): + + http protocol https { } + +For the service definition I make sure to add the remote address +and local address: + + match request header append "X-Forwarded-For" value "$REMOTE_ADDR" + match request header append "X-Forwarded-By" \ + value "$SERVER_ADDR:$SERVER_PORT" + +A further important logging configuration comes next, and I make +sure my relay logs the host, X-Forwarded-For, User-Agent, +Referer and url: + + match header log "Host" + match header log "X-Forwarded-For" + match header log "User-Agent" + match header log "Referer" + match url log + +For performance [1]: + + tcp { nodelay, sack, socket buffer 65536, backlog 100 } + +Next I disable vulnerable ciphers: + + tls no tlsv1.0 + tls no tlsv1.1 + tls tlsv1.2 + +Sadly tlsv1.3 is still in -current, so we will have to wait for +that. + +I configure keys like follows: + + tls ca cert "/etc/ssl/cert.pem" + tls keypair serviceA.domain + tls keypair serviceB.domain + +Finally we use the tables defined initially to route traffic to +the right internal service: + + match request header "Host" value "serviceA.domain" forward to + match request header "Host" value "serviceB.domain" forward to + +And that is it for the service definition. + +In addition we define the relay ("relays will forward traffic +between a client and a target server") like follows. The "protocol +https" is the junction between the two parts of the config. + + relay https_relay { + listen on $ext_addr port https tls + protocol https + + forward to port $honk_port check tcp + forward to port $inks_port check tcp + } + +The whole config: + +ext_addr="159.100.245.242" +honk_port="31337" +inks_port="31338" +table { 127.0.0.1 } +table { 127.0.0.1 } + +log state changes +log connection + +http protocol https { + match request header append "X-Forwarded-For" value "$REMOTE_ADDR" + match request header append "X-Forwarded-By" \ + value "$SERVER_ADDR:$SERVER_PORT" + match request header set "Connection" value "close" + + match header log "Host" + match header log "X-Forwarded-For" + match header log "User-Agent" + match header log "Referer" + match url log + + tcp { nodelay, socket buffer 65536, backlog 100 } + + tls no tlsv1.0 + tls no tlsv1.1 + tls tlsv1.2 + tls ca cert "/etc/ssl/cert.pem" + + tls keypair cybsec.network + tls keypair inks.cybsec.network + + match request header "Host" value "cybsec.network" forward to + match request header "Host" value "inks.cybsec.network" forward to +} + +relay https_relay { + listen on $ext_addr port https tls + protocol https + + forward to port $honk_port check tcp + forward to port $inks_port check tcp +} + + +[1] https://calomel.org/relayd.html + + diff --git a/data/remote-forensics.md b/data/remote-forensics.md new file mode 100644 index 0000000..62202a9 --- /dev/null +++ b/data/remote-forensics.md @@ -0,0 +1,159 @@ +Like everything else in information security, forensics is +constantly evolving. One matter of special interest for +practitioners is doing forensics on remote computers, not that +it's entirely new. 
+ +The use-case is self-explanatory to those working in the field, +but for the beginners I'll give a brief introduction. + +When you get a case on your desk and it lights up as something +interesting, what do you do? Probably your first step is searching +for known malicious indicators in network logs. Finding something +interesting on some of the clients, let's say ten in this case, +you decide to put some more effort into explaining the nature of +the activity. None of the clients is nearby, multiple of them are +even on locations with 1Mbps upload speeds. + +The next phase would probably be a search in open sources, perhaps +turning out in support of something fishy going on. Now you'd like +to examine some of the client logs for known hashes and strings +you found, and the traditional way to go is acquiring disk and +memory images physically. Or is it? That would have easily taken +weeks for ten clients. In this case you are lucky and you have a +tool for performing remote forensics at hand. The tool was a major +roll-out for your organization after a larger breach. + +What's new in remote forensics is that the tools begin to get more +mature, and by that I would like to introduce two products of +which I find most relevant to the purpose: + +* Google Rapid Response (GRR) [1] +* Mandiant for Incident Response (MIR) [2] + +Actually I haven't put the latter option to the test (MIR supports +OpenIOC which is an advantage) - but I have chosen to take GRR +for a spin for some time now. There are also other tools which may +be of interest to you such as Sourcefire FireAmp which I've heard +performs well for end-point-protection. I've chosen to leave that +out this presentation since this is about a different +concept. Surprisingly the following will use GRR as a basis. + +For this post there are two prerequisites for you to follow in +which I highly recommend to get the feel with GRR: + +* Setup a GRR server [3]. In this post I've used the current beta + 3.0-2, running all services on the same machine, including the + web server and client roll-in interface. There is one install + script for the beloved Ubuntu here, but I couldn't get it easily + working on other systems. One exception is Debian which only + needed minor changes. If you have difficulties with the latter, + please give me a heads-up. +* Sacrifice one client (it won't brick a production system as far + as I can tell either though) to be monitored. You will find + binaries after packing the clients in the GRR Server setup. See + the screenshot below for details. The client will automatically + report in to the server. + +You can find the binaries by browsing from the home screen in the +GRR web GUI. Download and install the one of choice. + +A word warning before you read the rest of this post: The GRR +website ~~is~~ was a little messy and not entirely intuitive. I +found, after a lot of searching, that the best way to go about it +is reading the code usage examples in the web GUI, especially when +it comes to what Google named flows. Flows are little plugins in +GRR that may for instance help you task GRR to fetch a file on a +specific path. + +Notice the call spec. This can be transferred directly to the +iPython console. Before I started off I watched a couple of +presentations that Google have delivered at LISA. I think you +should too if you'd like to see where GRR is going and why it came +to be. 
The one here gives a thorough introduction on how Google +makes sure they are able to respond to breaches in their +infrastructure [4]. + +I would also like to recommend an presentation by Greg Castle on +BlackHat for reference [5]. For usage and examples Marley Jaffe +at Champlain College have put up a great paper. Have a look at the +exercises at the end of it. + +What is good with GRR is that it supports the most relevant +platforms: Linux, Windows and OS X. This is also fully supported +platforms at Google, so expect development to have a practical and +long-term perspective. + +While GRR is relevant, it is also fully open source, and +extensible. It's written in Python with all the niceness that +comes with it. GRR have direct memory access by custom built +drivers. You will find support for Volatility in there. Well they +forked it into a new project named Rekall which is more suited for +scale. Anyways it provides support for plugins such as Yara. + +If you are like me and got introduced to forensics through +academia, you will like that GRR builds on Sleuthkit through pytsk +for disk forensics (actually you may choose what layer you'd like +to stay on). When you've retrieved an item, I just love that it +gets placed in a virtual file system in GRR with complete +versioning. + +The virtual filesystem where all the stuff you've retrieved or +queried the client about is stored with versioning for you +pleasure. In addition to having a way-to-go console application +GRR provides a good web GUI which provides an intuitive way of +browsing about everything you can do in the console. I think the +console is where Google would like you to live though. + +An so I ended up on the grr_console which is a purpose-build +iPython shell, writing scripts for doing what I needed it to +do. Remember that call spec that I mentioned initially, here is +where it gets into play. Below you see an example using the +GetFile call spec (notice that the pathspec in the flow statement +says OS, this might as well have been ``REGISTRY`` or ``TSK``): + + + token = access_control.ACLToken(username="someone", reason="Why") + + flows=[] + path="/home/someone/nohup.out" + + for client in SearchClients('host:Webserver'): + id=client[0].client_id + o=flow.GRRFlow.StartFlow(client_id=str(id), + flow_name="GetFile", pathspec=rdfvalue.PathSpec(path=path, pathtype=rdfvalue.PathSpec.PathType.OS)) + flows.append(o) + + files=[] + while len(flows)>0: + for o in flows: + f=aff4.FACTORY.Open(o) + r = f.GetRunner() + if not r.IsRunning(): + fd=aff4.FACTORY.Open(str(id)+"/fs/os%s"%path, token=token) + files.append(str(fd.Read(10000))) + flows.remove(o) + +If interested in Mandiant IR (MIR) and its concept, I'd like to +recommend another Youtube video by Douglas Wilson, which is quite +awesome as well [7]. + +Update 2020: Today I wouldn't recommend MIR/FireEye HX, but rather +something like LimaCharlie [8] due to the lack of hunting +capabilities in the HX platform. + + +[1] https://github.com/google/grr + +[2] http://www.fireeye.com/products-and-solutions/endpoint-forensics.html + +[3] https://grr-doc.readthedocs.io/en/latest/installing-grr-server/index.html + +[4] https://2459d6dc103cb5933875-c0245c5c937c5dedcca3f1764ecc9b2f.ssl.cf2.rackcdn.com/lisa13/castle.mp4 + +[5] GRR: Find All The Badness - https://docs.google.com/file/d/0B1wsLqFoT7i2Z2pxM0wycS1lcjg/edit?pli=1 + +[6] Jaffe, Marley. 
GRR Capstone Final Paper + +[7] NoVA Hackers Doug Wilson - Lessons Learned from using OpenIOC: https://www.youtube.com/watch?v=L-J5DDG_SQ8 + +[8] https://www.limacharlie.io/ diff --git a/data/signals-feeds.md b/data/signals-feeds.md new file mode 100644 index 0000000..531e29e --- /dev/null +++ b/data/signals-feeds.md @@ -0,0 +1,219 @@ + +## Key Takeaways + +* It is possible to index and tag a high number of RSS, OTX and + Twitter articles on limited computational power in seconds +* Building logic around timestamps is complex +* Structuring the resulting data in a graph is meaningful. + +## Introduction + +Today I am sharing some details about one of the multi-year +projects I am running. The project motivation is: + +> To stay up to date on cyber security developments within days. + +I didn't want a realtime alerting service, but an analysis tool to +gather important fragments of data over time. These fragments +makes up the basis of my open source research. The curated +information usually ends up on a channel like an NNTP feed, +sometimes with added comments. + +My solution was to create a common interface to ingest and search +content from third party sources, Achieving this is difficult, and +requires some work, but I found it feasible. + +Going throught some basic research I found that much of what +happens on the web eventually ends up on one of the following +three places (e.g. a mention): + +1. OTX +2. Twitter +3. RSS + +After some work I found that there were two things important to me +in the first iteration: + +1. Being able to recognize the characteristics of the content +2. Knowing the publish time of the data + +The primary problem was thus to build a program that scales with a +large number of feeds. + +Going from there I built a prototype in Python, which I've now +matured into a more performant Golang version. What follows from +here is my experience from that work. + +The tested component list of the program I am currently running are: + +* Gofeed [1] +* Badger [2] +* Apache Janusgraph [3,4] +* Apache Cassandra [5] +* Go-Twitter [6] +* Alienvault OTX API [7] +* Araddon Dateparse [8] + +[1] https://github.com/mmcdole/gofeed +[2] https://github.com/dgraph-io/badger +[3] https://janusgraph.org +[4] https://docs.janusgraph.org/basics/gremlin/ +[5] https://cassandra.apache.org +[6] https://github.com/dghubble/go-twitter/twitter +[7] https://github.com/AlienVault-OTX/OTX-Go-SDK/src/otxapi +[8] https://github.com/araddon/dateparse + + + +## The Lesson of Guestimation: Not All Feeds Are Created Equal + +Timestamps is perhaps some of the more challenging things to +interpret in a crawler and search engine. RSS is a loose standard, +at least when it comes to implementation. This means that +timestamps may vary: localized, invalid per the RFC standards, +ambiguous, missing and so on. Much like the web otherwise. Luckily +without javascript. + +The goal is simply about recognizing what timestamp are the most +correct one. A feed may contain one form of timestamp, while a +website may indicate another one. To solve this I use and compare +two levels of timestamping: + +* The feed published, updated and all items individual timestamps +* The item and website last modified timestamps + +Looking back, solving the first level of timestamping was +straight forward. These timestamps are present in the feed and for +RSS the logic to build a list of timestamps would look like this: + + + /* First we check the timestamp of all + * feed items (including the primary). 
+ * We then estimate what is the newest + * one */ + var feedElectedTime time.Time + var ts = make(map[string]string) + ts["published"] = feed.Published + ts["updated"] = feed.Updated + var i=0 + for _, item := range feed.Items { + ts[strconv.Itoa(i)] = item.Published + i++ + ts[strconv.Itoa(i)] = item.Updated + i++ + } + feedElectedTime, _, err = tsGuestimate(ts, link, false) + +The elected time can be used to compare with a previous feed +checkpoint to avoid downloading all items again. Using the above +logic I was also able to dramatically increase the success rate of +the program, since it requires a valid timestamp. The +`tsGuestimate` logic is something for a future post. + +Further the item/website timestamps requires a similar method, but in +addition I found it an advantage to do a HTTP HEAD request to the +destination URL to combine with the timestamps available from the +feed. The central and important aspect here is to abort retrieval +if an item already exists in the database, this is dramatically +increases the processing in each run. + +False timestamps are a problem. I noticed that websites publish +feeds with dynamic timestamps, which means that when you retrieve +the feed it adds the timestamp of now. This obviously creates +resource-intesive operations since the whole feed is then at risk +for re-indexing each run. + + +## Noise Reduction: Recognizing Content Characteristics + +Retrieving content is possible in several ways. For recognizing the +content I opted for and have success/good coverage using +regex. This is also some of the good things of curating articles, +since this means experience with questions such as "why did I miss +this article?" evolves into a new iteration of the program input. + +For instance, to stay on top of targeted cyber operations, I found +that much used phrases in articles was "targeted attack" and +"spear phishing". So based on that I deployed the following +keyword search (regular expression) which applies to every new +item ingested: + + "targeted":"(?i)targeted\\satt|spear\\sp", + +So a new article containing "targeted attack" in the body or title +is tagged with a hotword "targeted". Another hotword could be +"breach". + +Perhaps not surprising this data can be modelled in a graph like +follows. + + Tweet ─> URL in tweet ┌─> Targeted + └─> Breach + +## A Practical Example + +Traversing a news graph, we can go from the hotword "targeted", to +all items and articles for the past days linked to the hotword. + +I use Gremlin for querying. An example is shown below (some +details omitted): + + keyw="targeted" + _date="2021-02-10" + g.V().hasLabel('hotword').has('title',keyw).as("origin_hw"). + in().in().hasLabel('article:m').has('timestamp',gte(_date)).order().by('timestamp',asc).as('article'). + .select("origin_hw","article").by(values('title','timestamp')) + +The procedure above summarized: + +1. Find the node with the keyword "targeted" +2. Find all articles (for instance a tweet) that are two steps out + from the keyword (since these may be linked via a content node) +3. 
Get title and timestamp from hotword and tweet + +Using a match, which was incidentally not a tweet but an article, +from a RSS feed, we find the following: + + ==>{origin_hw=targeted, article=WINDOWS KERNEL ZERO-DAY EXPLOIT (CVE-2021-1732) IS USED BY BITTER APT IN TARGETED ATTACK} + +Retrieving the article with Gremlin, we can decide the source: + + gremlin > g.V().has('title','WINDOWS KERNEL ZERO-DAY EXPLOIT (CVE-2021-1732) IS USED BY BITTER APT IN TARGETED ATTACK').valueMap() + + + =>{link=[https://www.reddit.com/r/netsec/.rss], + title=[WINDOWS KERNEL ZERO-DAY EXPLOIT (CVE-2021-1732) IS USED BY BITTER APT IN TARGETED ATTACK], + src=[Reddit - NetSec], + src_type=[rss], + sha256=[8a285ce1b6d157f83d9469c06b6accaa514c794042ae7243056292d4ea245daf], + added=[2021-02-12 10:42:16.640587 +0100 CET], + timestamp=[2021-02-10 20:31:06 +0000 +0000], + version=[1]} + + ==>{link=[http://www.reddit.com/r/Malware/.rss], + title=[WINDOWS KERNEL ZERO-DAY EXPLOIT (CVE-2021-1732) IS USED BY BITTER APT IN TARGETED ATTACK], + src=[Reddit - Malware], + src_type=[rss], + sha256=[69737b754a7d9605d11aecff730ca3fc244c319f35174a7b37dd0d1846a823b7], + added=[2021-02-12 10:41:48.510538 +0100 CET], + timestamp=[2021-02-10 20:35:11 +0000 +0000], + version=[1]} + +In this instance the source was two Reddit posts which triggered +the keyword in question and others about a targeted incident in +China. Additionally this triggered a zero day hotword. + + +## Summary + +Through this post I have shown some key parts of how to build a +feed aggregator that can scale to thousands of feeds on a single +computer, with update times in seconds. + +I have also given a brief view on how Janusgraph and similar +systems can be used to model such data in a way which makes it +possible to search, find and eventually stay up to date on +relevant information to cyber security. + +When in place such a system may save hours per day since the data +is normalised and searchable in one place. diff --git a/data/ssh-ca-proxyjump.md b/data/ssh-ca-proxyjump.md new file mode 100644 index 0000000..af24dcc --- /dev/null +++ b/data/ssh-ca-proxyjump.md @@ -0,0 +1,228 @@ +## Key Takeaways + +* SSH has a key-signing concept that in combination with a + smartcard provides a lean, off-disk process +* A SSH-CA provides the possibility of managing access + without a central point of failure +* The use of SSH Jumphost is an easier way to tunnel + sessions end-to-end encrypted, while still maintaining + visibility and control through a central point + +## Introduction + +This post is an all-in-one capture of my recent discoveries with +SSH. It is an introduction for a technical audience. + +It turns out that SSH is ready for a zero trust and +microsegmentation approach, which is important for +management of servers. Everything described in this post is +available as open source software, but some parts require a +smartcard or two, such as a Yubikey (or a Nitrokey if you +prefer open source. I describe both). + +I also go into detail on how to configure the CA key without +letting the key touch the computer, which is an important +principle. + +The end-result should be a more an architecture providing a better +overview of the infrastructure and a second logon-factor +independent of phones and OATH. + +## SSH-CA + +My exploration started when I read a 2016-article by +Facebook engineering [1]. Surprised, but concerned with the +configuration overhead and reliability I set out to test the +SSH-CA concept. 
Two days later all my servers were on a new +architecture. + +SSH-CA works predictably like follows: + + [ User generates key on Yubikey ] + | + | + v + [ ssh-keygen generates CA key ] --------> [ signs pubkey of Yubikey ] + | - for a set of security zones + | - for users + | | + | | + | v + v pubkey cert is distributed to user + [ CA cert and zones pushed to servers ] - id_rsa-cert.pub + - auth_principals/root (root-everywhere) + - auth_principals/web (zone-web) + +The commands required in a nutshell: + + # on client + $ ssh-keygen -t rsa + + # on server + $ ssh-keygen -C CA -f ca + $ ssh-keygen -s ca -I -n zone-web -V +1w -z 1 id_ecdsa.pub + + # on client + cp id_ecdsa-cert.pub ~/.ssh/ + +Please refer to the next section for a best practice storage +of your private key. + +On the SSH server, add the following to the SSHD config: + + TrustedUserCAKeys /etc/ssh/ca.pub + AuthorizedPrincipalsFile /etc/ssh/auth_principals/%u + +What was conceptually new for me was principals and +authorization files per server. This is how it works: + +1. Add a security zone, like zone-web, during certificate + signing - "ssh-keygen * -n zone-web *". Local username does + not matter +2. Add a file per user on the SSH server, where zone-web + is added where applicable - + e.g. "/etc/ssh/auth_principals/some-user" contains "zone-web" +3. Login with the same user as given in the zone file - "ssh some-user@server" + +This is the same as applying a role instead of a name to the +authorization system, while something that IDs the user is +added to certificate and logged when used. + +This leaves us with a way better authorization and +authentication scheme than authorized_keys that everyone +uses. Read on to get the details for generating the CA key +securely. + + +## Keeping Private Keys Off-disk + +An important principle I have about private keys is to +rather cross-sign and encrypt two keys than to store one on +disk. This was challenged for the SSH-CA design. Luckily I found +an article describing the details of PKCS11 with ssh-keygen +[2]: + +> If you're using pkcs11 tokens to hold your ssh key, you +> may need to run ssh-keygen -D $PKCS11_MODULE_PATH +> ~/.ssh/id_rsa.pub so that you have a public key to +> sign. If your CA private key is being held in a pkcs11 +> token, you can use the -D parameter, in this case the -s +> parameter has to point to the public key of the CA. + +Yubikeys on macOS 11 (Big Sur) requires the yubico-piv-tool +to provide PKCS#11 drivers. It can be installed using +Homebrew: + + $ brew install yubico-piv-tool + $ PKCS11_MODULE_PATH=/usr/local/lib/libykcs11.dylib + +Similarly the procedure for Nitrokey are: + + $ brew cask install opensc + $ PKCS11_MODULE_PATH=/usr/local/lib/opensc-pkcs11.so + +Generating a key on-card for Yubikey: + + $ yubico-piv-tool -s 9a -a generate -o public.pem + +For the Nitrokey: + + $ pkcs11-tool -l --login-type so --keypairgen --key-type RSA:2048 + +Using the exported CA pubkey and the private key on-card a +certificate may now be signed and distributed to the user. + + $ ssh-keygen -D $PKCS11_MODULE_PATH -e > ca.pub + + $ ssh-keygen -D $PKCS11_MODULE_PATH -s ca.pub -I example -n zone-web -V +1w -z 1 id_rsa.pub + Enter PIN for 'OpenPGP card (User PIN)': + Signed user key .ssh/id_rsa-cert.pub: id "example" serial 1 for zone-web valid from 2020-10-13T15:09:00 to 2020-10-20T15:10:40 + +The same concept goes for a user smart-card, except that is +a plug and play as long as you have the gpg-agent +running. When the id_rsa-cert.pub (the signed certificate of +e.g. 
a Yubikey) is located in ~/.ssh, SSH will find the +corresponding private key automatically. The workflow will +be something along these lines: + + [ User smartcard ] -----------> [ CA smartcard ] + ^ id_rsa.pub | + | | signs + |------------------------------| + sends back id_rsa-cert.pub + + +## A Simple Bastion Host Setup + +The other thing I wanted to mention was the -J option of +ssh, ProxyJump. + +ProxyJump allows a user to confidentially, without risk of a +man-in-the-middle (MitM), to tunnel the session through a +central bastion host end-to-end encrypted. + +Having end-to-end encryption for an SSH proxy may seem +counter-intuitive since it cannot inspect the +content. I believe it is the better option due to: + +* It is a usability compromise, but also a security + compromise in case the bastion host is compromised. +* Network access and application authentication (and even + authorization) goes through a hardened point. +* In addition the end-point should also log what happens on + the server to a central syslog server. +* A bastion host should always be positioned in front of the + server segments, not on the infrastructure perimeter. + +A simple setup looks like the following: + + [ client ] ---> [ bastion host ] ---> [ server ] + + +Practically speaking a standalone command will look like +follows: + + ssh -J jump.example.com dest.example.com + + +An equivalent .ssh/config will look like: + + Host j.example.com + HostName j.example.com + User sshjump + Port 22 + + Host dest.example.com + HostName dest.example.com + ProxyJump j.example.com + User some-user + Port 22 + +With the above configuration the user can compress the +ProxyJump SSH-command to "ssh dest.example.com". + +## Further Work + +The basic design shown above requires one factor which is +probably not acceptable in larger companies: someone needs +to manually sign and rotate certificates. There are some +options mentioned in open sources, where it is normally to +avoid having certificates on clients and having an +authorization gateway with SSO. This does however introduce +a weakness in the chain. + +I am also interested in using SSH certificates on iOS, but +that has turned out to be unsupported in all apps I have +tested so far. It is however on the roadmap of Termius, +hopefully in the near-future. Follow updates on this subject +on my Honk thread about it [4]. + +For a smaller infrastructure like mine, I have found the +manual approach to be sufficient so far. + + +[1] Scalable and secure access with SSH: https://engineering.fb.com/security/scalable-and-secure-access-with-ssh/ +[2] Using a CA with SSH: https://www.lorier.net/docs/ssh-ca.html +[3] Using PIV for SSH through PKCS #11: +https://developers.yubico.com/PIV/Guides/SSH_with_PIV_and_PKCS11.html +[4] https://cybsec.network/u/tommy/h/q1g4YC31q45CT4SPK4 diff --git a/data/ssh-certs-apple-t2.md b/data/ssh-certs-apple-t2.md new file mode 100644 index 0000000..7784133 --- /dev/null +++ b/data/ssh-certs-apple-t2.md @@ -0,0 +1,94 @@ +## Key Takeaways + +* SSH certificates can be used with the Apple T2 chip on + macOS as an alternative to external smart cards, + authenticated with a fingerprint per session. +* The Mac T2 chip serves as an extra security layer by creating + private keys in the secure enclave. +* The CA can be stored on an external smartcard, only + signing for access in a limited period - again limiting + the exposure. 
+ +## Introduction + +Over the past days I have been going down a deep, deep +rabbit hole of SSH proxy jumping and SSH certificates +combined with smart cards. + +After playing around with smart cards for SSH, I recognized +that not only external smart cards such as the Yubikey or +Nitrokey is a possible lane to go down. + +Mac computers comes with a security chip called T2. This chip is +also known to host something Apple calls Secure Enclave [1]. In +the Secure Enclave you can store keys. + +It will probably not serve as an equally secure solution as with +external smart cards, but it is a better balance for usability. + +The T2 is permanently stored in hardware on one host only, +so the access needs to be signed on a per-host basis. In +such I would say the T2 and external smart cards complement +each other. + +Always having the key available will bring two additional +vulnerabilities: + +* If compromised, the key is always available logically +* Separation of equipment and key is not possible e.g. in a + travel situation + +With a central pubkey directory tied to an identity +(automated), the T2 can be of better use for an enterprise +setup. + +## Setting up a Private Key in Secure Enclave + +While fiddling around I found sekey on Github [2]. The +project seems abandoned, but it is the secure enclave that +does the heavy lifting. + +The short and easy setup are: + + $ brew cask install sekey + $ echo "export SSH_AUTH_SOCK=$HOME/.sekey/ssh-agent.ssh" >> ~/.zshrc + $ echo "IdentityAgent ~/.sekey/ssh-agent.ssh" >> ~/.ssh/config + $ source ~/.zshrc + +A keypair can now be generated in the secure enclave by: + + $ sekey --generate-keypair SSH + $ sekey --list-keys + +Now export the public key of the curve generated on-chip: + + $ sekey --export-key > id_ecdsa.pub + +Using the trick we found in our recent venture into using +smart cards for signing the key, we can used PCKS#11 without +compromising security [3]. In this case I use a Nitrokey: + + $ brew cask install opensc + $ PKCS11_MODULE_PATH=/usr/local/lib/opensc-pkcs11.so + $ ssh-keygen -D $PKCS11_MODULE_PATH -e > ca.pub + $ ssh-keygen -D $PKCS11_MODULE_PATH -s ca.pub -I example -n zone-web -V +1h -z 1 id_ecdsa.pub + Enter PIN for 'OpenPGP card (User PIN)': + Signed user key id_ecdsa-cert.pub: id "example" serial 1 for zone-web valid from 2020-10-14T20:26:00 to 2020-10-14T21:27:51 + cp id_ecdsa-cert.pub ~/.ssh/ + +If you now try to ssh into a server using the given +certificate authority as shown in the SSH-CA post [3], +access should be granted with a fingerprint. + +## A Word of Caution + +The T2 has some vulnerabilities shown recently [4]. Make +sure to include these in your risk assessment of using +it. If you won't go down the smart card route it will still +be better than storing the key on disk. + + +[1] https://support.apple.com/guide/security/secure-enclave-overview-sec59b0b31ff/web +[2] https://github.com/sekey/sekey +[3] https://secdiary.com/2020-10-13-ssh-ca-proxyjump.html +[4] https://inks.cybsec.network/tag/t2 diff --git a/data/telemetry.md b/data/telemetry.md new file mode 100644 index 0000000..d781fdd --- /dev/null +++ b/data/telemetry.md @@ -0,0 +1,250 @@ +Telemetry for cyber security is currently at a +crossroads. While past methods have been efficient by being +based on network monitoring, the current revolution in +encryption and the distributed workspace makes it +insufficient to solely rely on network monitoring. Through +this post we are going to focus on the current challenges. 
+ +> Telemetry is an electrical apparatus for measuring a +> quantity (such as pressure, speed, or temperature) and +> transmitting the result especially by radio to a distant +> station +> – Meriam Webster + +Telemetry, a term mostly used by AV-vendors, have become +broadly applied as services change from a central to +decentralised geographically spread. Yesterday an employee +would work at his desk from 9-5 and then go home, while +today's modern worker moves around the office area and can +basically work from anywhere in the world when they feel +like it. + +In cyber security, telemetry can generally be categorised +in: 1) Network-centric and 2) endpoint-based. A complete +telemetry profile is essential for being able to monitor +security events and to execute retrospective +analysis. Through my recent article on indicators [1] I +proposed a structure for indicators organised in three +levels of abstraction. In this article a telemetry profile +means something that covers a degree of these three levels. + + | Level of abstraction | | Formats + |-----------------------|----|------------- + | Behavior | | MITRE (PRE-)ATT&CK + |-----------------------|--->|------------- + | Derived | | Suricata+Lua, Yara + |-----------------------|--->|------------- + | Atomic | | OpenIOC 1.1 + + +## The Challenges + +There are generally two problems that needs to be fully +solved when collecting data for cyber security: + +* The use of encryption from end-to-end +* Workers and thereby the defended environment are or will be distributed + +As of February 2017 the web was 50% encrypted [2]. Today +that number [3] is growing close to 70%. + +For defense purposes, it is possible to identify malicous +traffic, such as beaconing, through metadata analysis. There +have been some developments on detecting anomalies in +encrypted content lately - namely the fingerprinting of +programs using SSL/TLS. In the future I believe this will be +the primary role of network-based detection. This is +actually a flashback to a pre-2010 monitoring environment +when full content was rarely stored and inspected by +security teams. + +An additional element to consider is the previous debate +about public key pinning, which has now evolved into +Expect-CT [4]. This means that man in the middle (MitM) +techniques is going to be a no-no at some point. Yes, that +includes your corporate proxy as well. + +There is one drawback and dealbreaker with the above for +security teams: it requires access to the datastream used by +the endpoints to be fully effective. + +VPNs are going away as more resilient and modern network +architectures will become dominating. The most promising +challenger at the moment is the Beyondcorp [5] (based on +zero trust) architecture proposed by Google more than six +years ago. A zero trust architecture means that clients will +only check in to the corporate environment at the points +that _they_ need or are in the vicinity of corporate +resources. Other activity, such as browsing on external +websites are actually no longer going via the corporate +infrastructure or its monitored links. Additionally, the +endpoint is easily the most common infiltration vector. + +To be honest, the Beyondcorp model reflects to a larger +extent how humans actually interact with computers. Humans +have never been confined to the perimeter of the enterprise +network. This may be some of the reason for organisations +being in a currently defeatable state as well. 
The only ones +to confine themselves to the enterprise network is +ironically the network defenders. + +> The only ones to confine themselves to the enterprise network is +> ironically the network defenders. + +The battle of controlling the technology evolution is not +completely lost though, it is a matter of changing the +mindset of where data or telemetry is collected. Yesterday +it was at the corporate proxy or in the corporate +environment - today it is on the endpoint and during the +connections to valuable resources. + +For endpoints, the primary challenges currently faced are: + +* Maintaining the integrity of locally stored and buffered data +* The availability and transport of data to a centralised logging instance +* Confidentiality of the data in transport or at rest +* Data source consistency for central correlation of information from several + host sources +* Raising the stakes on operational security in a cat and mouse + chase between intruders and defenders + +Remote logging is a subject that has gained much publicity +previously, so we are not going into depth about that here. + +### Existing Tooling For Endpoints + +This section was not originally a part of the scope of this +article, but I'd like to establish a baseline of parts of +the available tooling to handle the above issues. I also +believe it touches some of the endpoint challenges. + +For the purpose of this article, we define the following +well-known computer abstraction stack: + +1. Hardware +2. Operating System +3. Application + +Hardware verification and logging is currently a more or +less unexplored field, with primarily only one tool +available to my knowlege. That tool is Chipsec [6] which has +been of interest and integrated into the Google Rapid +Response (GRR) [7] project for some time. + +Operating system logs are well understood today, and many +organisations manages logging from the host operating system +properly. + +There are increasingly good event streaming and agent-based +systems available, such as LimaCharlie [8], Sysmon [9] and +Carbon Black [10]. The media focus of these platforms are on +the more trendy term "hunting", but their real purpose is +OS-level logging and pattern matching. + +Further, distributed forensic platforms are available from +FireEye (HX) and an open source equivalent from Google named +GRR. GRR have been featured extensively on this site +previously. Common for these are that they do not stream +events, but rather stores information on the endpoint. + +Application layer logging is extremely challenging. The +logging mechanism in this regard needs to be connected to +the structure of the application itself, and there are a lot +of applications. Further, many application developers does +not focus on logging. + +Application logging is important and could be seen as the +technical contextual information provided by the +endpoint. Exposed applications that are important in terms +of coverage: + +* Browsers +* Email Readers +* Application Firewalls (if you have one) +* Instant Messaging Clients +* Rich Document editors, such as Excel, Word, Powerpoint + +These applications are important since they are the first +point of contact for almost any technical threat. Done +right, application logs will be at a central location before +the intruder manages to get a foothold on the client. Thus, +the risk of data being misrepresented in the central system +are highly reduced (integrity). 
+ +Taking browsers and Microsoft Office as an example, there +are some options readily available: + +* Firefox HTTP and DNS logging: mozilla.org [11] +* Office Telemetry logging: Office Telemetry Log [12] + +The above examples are not security focused as far as I +could tell, more often they are debug oriented. However, the +same data is often what we are after as well (such as: did +the document have a macro? or what is the HTTP header?). + +The dependency on the application developers to create +logging mechanisms is quite a challenge in this +arena. However, I believe the solutions in cases where +applications does not log sufficiently is to take advantage +of plugins. Most modern applications supports plugins to +some extent. + +To summarise the tooling discussion, we can populate the +computer abstraction layers with the mentioned tools. + + | Level of abstraction | | Tools + |-----------------------|----|------------- + | Application | | Browser, Email and so on + |-----------------------|--->|------------- + | Operating System | | LC, CB, Sysmon, + |-----------------------|--->|------------- + | Hardware | | Chipsec + +## Conclusions: How Do We Defend in The Future? + +In this article we have defined a structure and discussed in +short one of the most prominent challenges faced by +enterprise defenders today: how do we defend in the future? + +Technology. This is the point were technology alone is no +longer the sole solution to defending a network. Modern +network architectures means that defenders needs to be able +to fully comprehend and use the human nature as sensors. It +is also about building intuitive systems which makes the +necessary data and information available to the +defenders. In my mind technology has never been the sole +solution either, so the technology evolution is for the +greater good. + +It seems obvious and unavoidable to me that network +defenders must start looking outside the perimeter, just as +intruders have done for many years already. This means +adapting the toolsets available and lobbying for an +architecture that reflects how humans actually use +technology resources. Most people have owned private +equipment for many years (surprise), and the line between +employee and enterprise is blurred and confusing when +realitity now sinks in. + +This means, in the technology aspect, that an emphasis must +be put on the endpoints - and that network monitoring must +again be about the metadata of the activity. In short: +collect metadata from networks and content from endpoints. + +Only this way will we, in the future, be able to create a +full telemetry profile from each device under our +responsibility. 
+ + +[1] Article on indicators: /indicators/ +[2] 50% encrypted: https://www.eff.org/deeplinks/2017/02/were-halfway-encrypting-entire-web +[3] that number: https://letsencrypt.org/stats/ +[4] Expect-CT: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Expect-CT +[5] Beyondcorp: https://cloud.google.com/beyondcorp/ +[6] Chipsec: https://github.com/chipsec/chipsec +[7] Google Rapid Response (GRR): https://github.com/google/grr-doc/blob/master/publications.adoc +[8] LimaCharlie: https://github.com/refractionPOINT/lce_doc/blob/master/README.md +[9] Sysmon: https://www.rsaconference.com/writable/presentations/file_upload/hta-w05-tracking_hackers_on_your_network_with_sysinternals_sysmon.pdf +[10] Carbon Black: http://the.report/assets/Advanced-Threat-Hunting-with-Carbon-Black.pdf +[11] mozilla.org: https://developer.mozilla.org/en-US/docs/Mozilla/Debugging/HTTP_logging +[12] Office Telemetry Log: https://msdn.microsoft.com/en-us/library/office/jj230106.aspx diff --git a/data/travel.md b/data/travel.md new file mode 100644 index 0000000..2724f40 --- /dev/null +++ b/data/travel.md @@ -0,0 +1,361 @@ +Travelling with electronic devices is a challenge, and this is +certainly the case if you do not have a travel program for your +employees, where you must tinker with a new setup on a case by +case basis. The complexity of the matter is though, even when it +comes to resources, as it requires full time attention. + +Some organisations choose to ignore the problem all together, +others again does not fully respect their own threat model. The +latter may be just as dangerous, as it may lead to a false sense +of security for the travellers. + +This article is about establishing a technical laptop setup that +can be re-used with ease. Thus, other operational and strategic +aspects are left out. The information presented evolves around +organisations, but might as well apply for a private travel of +exposed individuals. + +## Main Drivers + +With that out of the way: multiple overall factors are left for +consideration. The following factors are the main drivers and +equally important when developing a technical model of an abroad +operation. + +* Threat resiliency. Equipment on travel can really never be + secured well enough, but it can be hardened to the degree that a + threat actor needs to risk exposure to compromise it +* Usability for the traveller. Equipment that feels inconvenient + will be avoided by the traveller at some point +* Usability for the supporting organisation (both security and IT + operations). Such setups may require much time and attention to + develop and if there are an increasing number of travellers to + high risk areas the setup needs to scale +* Cost. A travel program is a balance between environment, + security and cost. If the cost and environmental impact + surpasses the value that needs to be secured, the travel program + misses some of its value. Critical infrastructure organisations + is a different ball game than other industries on this point. + +When it comes to threats, the most prominent one is the evil maid +infiltration vector - which is basically someone gaining physical +access to a computer. Motherboard recently published an article +on how a malicious party could add a backdoor to a Dell (example +used) laptop in less than 5 minutes [1]. + +Other examples of relevant techniques used against travellers are: +electronic eavesdropping using cell networks, physical monitoring +of hotel rooms (e.g. 
camera surveillance), malicious charging +stations and so on. More details on general infiltration +techniques can be found in the Mitre ATT&CK's "Initial Access" +category (each described on their Wiki [1,2]. + +## Conceptual Overview + +Now that we have reviewed the main drivers, the question is if you +can protect against the given threat model in an easily achievable +way. To assess that we will first have to a look at an conceptual +model for travel. Taking a top-down approach, the travel setup +will in most cases consist of two components: + +1. The devices used for travel +2. The server side infrastructure + +There are arguments for a standalone operation, but the legal +ramification and practical impact of sending an employee into a +hostile environment with anything but local encryption is risky at +best. To note: that is, if the user will actually produce or carry +anything of value. If not, a standalone setup may in some cases be +argued for. + +Tactical no-brainers when travelling are the following: + +1. The system should disclose as little as possible about the + traveller's pattern of activity and content +2. As little information as possible should be at rest on devices + at risk +3. It should come at a high cost to compromise the end-point both + for physical and technical exploitation +4. The equipment should never be connected to an organisation's + service infrastructure directly before, during or after travel +5. The system should not be obviously provocative to locals - + e.g. during airport inspections. + +As far as I have found, there are currently one desktop system +that sufficiently meet these criterions - and that is ChromeOS +which comes with sane default settings, has a really minimal +configuration and is usable to an average person. However, +ChromeOS is not a mobile operating system - and for that purpose +iOS and Android is a better fit even though they do not tick off +all the above boxes. + +With that in mind the following model, that I have named "The +Tactical Travel Protection Model", provides a hardened, basic +infrastructure setup that uses cloud providers to hide in plain +sight. + +![The Tactical Travel Protection Model shows the concept of a full stack travel +setup](/static/img/data/tactical_travel_protection_model.png) + +The model further detailed in the following section. + +## Scalability and Technical Implementation + +With the conceptual model shown in the last section, it is time to +dive into implementation in a practical situation. The beauty of +the model is its modularity, so a component - such as a cloud +server, can easily be put in a local and physically controlled +location. Thus, please consider the technologies mentioned as an +example - the power of the model comes to play when you start +switching things up. + +### Server Side Components + +Consider theavailability of external services in all parts of the +process. Ideally a travel device should store information only +outside the regional location of a traveller. Balance storage +with requirements of availability. An example of such is that an +enforced VPN connection may not always be available, which would +practically leave an SFTP link exposed or down. + +For the example technologies used in the model shown in the +previous section, following sections shows the use. + +#### Cloud Policy, Provisioning, Device and User Management + +The reason we really need to use a device management service is +the scalability of deployment. 
Using a standalone approach may +work and provide some additional security due to the independence +of each device, but it is inevitable in the long run if you handle +even a low amount of travels. + +In this case, especially due to using ChromeOS, G Suite is the +most straightforward choice. It is important to focus the solution +on managing devices when speaking of travels, not pushing +sensitive configuration files and so on. If encountering a +compromise of the G Suite administrative account - it is possible +to push threat actor-controlled applications and configurations to +devices. Due to this it is essential to clean out the management +domain or create a new, untraceable one once in a while. + +G Suite is a granular solution. Examples of recommended policies +are: enforced use of security tokens and the disabling of other +two factor authentication options, screen lock upon lid close and +so on. + +When testing G Suite and ChromeOS I figured that it is easiest to +provision VPN configuration files (``.onc``) and certificates +manually. For iOS the same goes with ``.mobileconfig``. Doing this +adds another protective layer. + +#### VPN + +For VPN, my experience is that the most reliable option is using +native supported VPN clients in the operating system used for +travel. In this case it is ChromeOS with OpenVPN and iOS with +IPSec. This adds a bit to the complexity as iOS does not support +OpenVPN which runs most reliably in some countries that censors +the Internet. However, ChromeOS does. The solution to this is +using two VPS nodes for tunneling traffic: + +1. OpenVPN service through ansible-openvpn-hardened [4] +2. IPSec service through [5]. Lenny Zeltser created a + deployment-guide on algo recently [6] + +Again: to reduce exposure through centrality, you should not +provision device-specific keys from central management +consoles. Also, make sure to use certificates by any service that +needs to connect to the Internet. + +**OpenVPN**: + +Configure according to the README on the +``ansible-openvpn-hardened`` Github page. When you deploy the +OpenVPN server, you will be left with a file named something like +``@.preregistration-pki-embedded.ovpn`` in +the ``fetched_credentials/`` directory. Just like Apple +has its ``mobileconfig`` format, the Chromium Project uses the +Open Network Configuration (ONC) [7]. In order to convert this +format to a working configuration file, use ovpn2onc.py [9] like +the following. + + python3 reference/convert.py --infile *-pki-embedded.ovpn --outfile vpn_configuration.onc --name my_vpn + +This results in a configuration file named +``vpn_configuration.onc``. ChromeOS will not give you any feedback +here, so make sure to read through everything to get it right the +first time. If you end up troubleshooting, I found that the +Chromium project do have some working examples [9]. Import +``vpn_configuration.onc`` in Chrome as shown in the next section. + +Due to the hardened setup, be particularly strict to configure +with an OS version according to the repo README. For instance +Debian 8.10 won't work. + +**Algo**: Has great docs as-is. + +#### SFTP + +An SFTP service is simple to manually deploy. However, when +scalability hardening matters it is best to automate the +deployment. Through testing available Ansible scripts I ended up +with Johan Meiring's ansible-sftp [10]. Again, the configuration +is self-explanatory. You should however note that public +keys should be put in a ``files/`` directory under +``ansible-sftp`` root. 
Since this is a traveller setup you should seek to create a disconnect from cloud drives and instead use local storage and SFTP. Disable OneDrive in Office 365 Business and Google Drive in G Suite.

#### Deploying an Out-of-Band (OOB) Channel

Communication is king, and perhaps one of the most important things you configure.

I recently described using Matrix and Riot for an OOB channel [11].

#### Security Keys

Nowadays, strong authentication is so easy that everyone should use it. In a hostile environment it is hygiene. Google uses security keys such as Yubikeys and Feitian tokens in its authentication services, and so should a traveller [12,13,14]. This eliminates some of the uncertainty when authenticating against remote servers, and a token is something the traveller can keep on-body at all times. For this setup, not every service can maintain usability when using tokens. Those services - such as a mounted SFTP share - should use certificates.

### Client Side Components

So why a Chromebook?

* It has a minimal configuration. Everything you do is in the browser
* You get granular control through G Suite
* It is based on the Linux kernel, which means it is different from Windows and may require some extra effort from a threat actor
* A lot of work has gone into the user interface in ChromeOS, so it will feel familiar and intuitive to users
* ChromeOS has a lot of security features built in [15], such as Secure Boot, security key login and so on.

G Suite will help you some of the way when it comes to configuration control. However, it requires some client-side configuration.

The client side consists of several components. I chose to model these as five layers:

The Traveller. The most important asset on the travel is most likely the human traveller. This asset will have some values assigned to it, such as security keys, credentials and the traveller's own knowledge. Anonymise information stored here; in other words, make sure to use an identifier and not the traveller's real name.

Device and information. When selecting devices and putting information on them, you have entered the device and information exposure layer. This will typically consist of all hardware peripherals, such as cameras, and content such as calls made from a handset. Other things to consider here for ChromeOS are deploying PGP and its keys with Mailvelope, and Office from the Google Play Store.

Content. It was actually kind of interesting to model this from an iOS and ChromeOS perspective, because ChromeOS keeps most of its applications in the browser, while iOS has native apps alongside Chrome. This again means that the exposure surface of ChromeOS is more uniform than that of iOS.

Native applications. These are the applications installed directly in the operating system. For iOS this means a larger exposure, with native applications for e.g. communications, while on ChromeOS you will basically only install an SFTP plugin for the file system and otherwise use Chrome for the duration of a travel.

Transport. When travelling to a hostile environment, tunnel all communications to and from the system as far as possible. Both iOS and ChromeOS have sufficient mechanisms here, as we reviewed in the previous section. For encryption keys:

1. Transfer the encryption keys stored in the ``.p12`` file and the VPN configuration to the Chromebook (a sketch for producing the ``.p12`` follows after this list)
2. Install the encryption keys in ``chrome://settings/certificates``. Use the "Import and Bind" option to install the certificate to the TPM
3. Import the VPN configuration (ONC) in ``chrome://net-internals/#chromeos``
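On step 1: if your PKI hands you a separate PEM key and certificate instead of a ready-made ``.p12``, the bundle can be produced with openssl before the transfer. The file names below are placeholders, not output of the playbooks referenced earlier, and openssl will prompt for an export password.

    import subprocess

    # Placeholder file names - point these at wherever your PKI left the client material.
    KEY, CERT, CA, OUT = "client.key", "client.crt", "ca.crt", "client.p12"

    # Bundle key, certificate and CA chain into a PKCS#12 file that ChromeOS can
    # import (and bind to the TPM) via chrome://settings/certificates.
    subprocess.run(
        ["openssl", "pkcs12", "-export",
         "-inkey", KEY, "-in", CERT, "-certfile", CA, "-out", OUT],
        check=True,
    )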
That is basically it.

## Conclusion

The art of balancing threat resiliency, usability and cost is an intriguing problem.

The technology presented in this article is in no way designed to survive in hostile environments when considering the capabilities of nation state threat actors. Fundamental security mechanisms are lacking in this regard, and only companies like Microsoft, Google and Apple can provide the basis to change that. We can, however, slow these actors down considerably.

An important aspect to consider, in order to compensate for the above weaknesses, is that organisations need to handle these problems on an operational and strategic level as well.

Using cloud environments is a solid choice for travel. However, against threat actors that are able to gain access to the hosts of those environments, they are not sufficient. To solve this, the most valuable services may be moved in-house or to a hardened cloud environment. End-to-end encryption is also required when using cloud services, such as the included inbox of G Suite.

Please keep in mind that The Tactical Travel Protection Model is a core model. This article does not cover every aspect; examples of what is left out are encryption and protection of external peripherals and memory devices, and operational and strategic considerations.

Organisations have yet to prove a working model resilient to capable adversaries. Hopefully this article can be a foundation for discussing variations and weaknesses in the community.

[1] https://motherboard.vice.com/en_us/article/a3q374/hacker-bios-firmware-backdoor-evil-maid-attack-laptop-5-minutes
[2] https://mitre.github.io/attack-navigator/enterprise/
[3] https://attack.mitre.org/wiki/Initial_Access
[4] https://github.com/bau-sec/ansible-openvpn-hardened
[5] https://github.com/trailofbits/algo
[6] https://zeltser.com/deploy-algo-vpn-digital-ocean/
[7] https://www.chromium.org/chromium-os/chromiumos-design-docs/open-network-configuration
[8] https://gist.github.com/tommyskg/6d0eeecc5bab65a49d72f5b16e086976
[9] https://chromium.googlesource.com/chromium/src/+/32352ad08ee673a4d43e8593ce988b224f6482d3/chromeos/test/data/network
[10] https://github.com/johanmeiring/ansible-sftp
[11] https://secdiary.com/2018-07-11-matrix.html
[12] https://krebsonsecurity.com/2018/07/google-security-keys-neutralized-employee-phishing/
[13] https://www.yubico.com/product/yubikey-4-series/#yubikey-4c
[14] https://ftsafe.com/onlinestore/product?id=3
[15] http://dhanus.mit.edu/docs/ChromeOSSecurity.pdf

diff --git a/data/vantage.md b/data/vantage.md
new file mode 100644
index 0000000..c6cf8ce
--- /dev/null
+++ b/data/vantage.md
@@ -0,0 +1,222 @@

## Key Takeaways

* Monitoring the technology infrastructure is a key element of situational awareness in both security and IT operations.
* A 2020 infrastructure should use a modern application layer reverse proxy, such as Pomerium, in front of all services. Leave all clients outside.
* The threat landscape should be the focus when shaping a defendable infrastructure.

Disclaimer: If you have outsourced all your equipment and information to "the cloud", this post is a sanity check of the relationship with your vendor.
The primary audience of this post is everyone willing to invest in people and knowledge to provide the best possible defense for their people and processes, and the technology supporting them.

## Introduction

I cannot begin to imagine how many times Sun Tzu must have been quoted in board rooms around the world:

> If you know the enemy and know yourself, you need not fear the result of a hundred battles. If you know yourself but not the enemy, for every victory gained you will also suffer a defeat. If you know neither the enemy nor yourself, you will succumb in every battle.

However much repeated, the message has not come across. Why is that? Because this is a hard problem to solve. It sits in the intersection between people, culture and technology.

If everyone used reverse proxies in a sensible way, I would probably have a lot less to do at work. Time and time again it turns out that organisations do not have configuration control over their applications and infrastructure, and the reverse proxy is a central building block in gaining it. To a large extent everything comes down to logs and traceability when an incident occurs.

## Beyondcorp and The Defendable Infrastructure

The lucky part of this hard-to-solve problem is that Google has already prescribed one good solution in its Beyondcorp whitepapers [1].

But much of this was described earlier by the Norwegian Armed Forces in its five architecture principles for a defendable infrastructure, published by the former Head of Section of its Critical Infrastructure Protection Centre [2]:

1. Monitor the network for situational awareness
2. A defender must be able to shape the battleground to have freedom of movement and to limit the opponent's freedom of movement
3. Update services to limit vulnerability exposure
4. Minimize the infrastructure to limit the attack surface
5. Traceability is important to analyze what happened

I know that Richard Bejtlich was an inspiration for the defendable infrastructure principles, so the books written by him are relevant [4,5].

Defendable infrastructure is a good term, also used in a 2019 Lockheed Martin article which defines it well [3]:

> Classical security engineering and architecture has been trying to solve the wrong problem. It is not sufficient to try to build hardened systems; instead we must build systems that are defendable. A system’s requirements, design, or test results can’t be declared as "secure." Rather, it is a combination of how the system is designed, built, operated, and defended that ultimately protects the system and its assets over time. Because adversaries adapt their own techniques based on changing objectives and opportunities, systems and enterprises must be actively defended.

The development of these architecture principles happened before 2010, so the question remains how they apply in 2020. We may get back to the other principles in later posts, but the rest of this article will focus on monitoring from a 2020 perspective.

## Monitoring - a Central Vantage Point

One thing that has developed since 2010 is our understanding of where to position monitoring capabilities, along with the more mainstream possibility of detection on endpoints. The historical focus of mature teams was primarily on the network layer. While the network layer is still important as an objective point of observation, the application layer has received more attention. The reason is the acceptance that this is often where exploitation happens, and that detection capabilities for it have become available as commercial products.
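As a small aside on why that layer is such a rich observation point: even a plain reverse proxy access log supports questions that are hard to answer from packet data alone. A minimal sketch, assuming a combined-format access log and placeholder file and path names:

    import collections
    import re

    LOGFILE = "access.log"   # assumed: combined-format log from the reverse proxy
    SENSITIVE = "/admin"     # placeholder path prefix for a sensitive application

    line_re = re.compile(r'^(\S+) \S+ \S+ \[[^\]]+\] "(\S+) (\S+)')

    hits = collections.Counter()
    with open(LOGFILE) as fh:
        for line in fh:
            match = line_re.match(line)
            if not match:
                continue
            ip, method, path = match.groups()
            if path.startswith(SENSITIVE):
                hits[ip] += 1

    # Clients touching the sensitive application, most active first.
    for ip, count in hits.most_common():
        print(ip, count)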
With that in mind, a shift in the understanding of best practice for positioning reverse proxies has occurred as well. While the previous recommendation was to defend inside-out, the focus is now to defend outside-in.

Defending outside-in means taking control of what can be controlled: the application infrastructure. In all practicality this means positioning the reverse proxy in front of your server segment, instead of in front of the whole network with clients included.

    [ Client on-prem ] \                                [ Application A ]
                        \                              /
                         ---> [ Reverse proxy ] ----> [ App gateway ]
                        /             ^                \
    [ Client abroad ]  /        risk assessment         [ Application B ]

Previously, for some reason, we put the "client on-prem" on the other side of the reverse proxy, because we believed we could control what the user was doing. Today, we know better. This is not a trust issue; it is a matter of prioritising based on asset value and defending capacity.

A reverse proxy is also a central vantage point of your infrastructure. In a nutshell, if you are good at detecting security incidents at this point, you are in a good position to keep freedom of movement - such as channeling your opponent.

The modern reverse proxy has two integration capabilities that legacy proxies do not:

* Single sign-on (SSO), which provides strong authentication and good identity management
* Access control logic (Google calls this the access control engine)

In fact, Google stated in 2013 that it uses more than 120 variables for the risk assessment in its access control logic for Gmail [6]. In comparison, most organisations today use three: username, password and, in half the instances, a token.

> Every time you sign in to Google, whether via your web browser once a month or an email program that checks for new mail every five minutes, our system performs a complex risk analysis to determine how likely it is that the sign-in really comes from you. In fact, there are more than 120 variables that can factor into how a decision is made.

I imagine that Google uses factors like the following on top of the plain username/password approach (they state some of these in their article):

- Geo-location, with an algorithmic score based on the distance from the location of the last login to the current one. A k-means-style distance to clusters of previous login locations could be a good fit.
- Source ASN risk score
- The asset subject to access
- User role scored against the asset subject to access
- Device state (updated, antivirus installed and so on)
- Previous usage patterns, like time of day
- Other information about the behavioural patterns of relevant threats

Another nice feature of a reverse proxy set up this way is that it minimizes the exposure and gives defenders the possibility to route traffic the way they see fit. For instance, it would be hard for an attacker to differentiate between a honeypot and a production system in the first place. One could also challenge the user in cases of doubt, instead of plainly denying access as is sometimes done.
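To make the access control engine idea more concrete, here is a minimal sketch of that kind of scoring logic. The variables, weights and thresholds are invented for the illustration - a real engine weighs far more signals against historical data - but it shows the allow/challenge/deny decision described above.

    from dataclasses import dataclass

    @dataclass
    class LoginAttempt:
        username: str
        km_from_last_login: float  # geo-distance since the previous successful login
        asn_risk: float            # 0.0 (clean ISP) .. 1.0 (known-bad hosting ASN)
        asset_sensitivity: float   # 0.0 (public) .. 1.0 (crown jewels)
        device_compliant: bool     # patched, disk encryption on, and so on
        usual_hours: bool          # within the user's normal activity window

    # Invented weights - a real access control engine would tune these against history.
    def risk_score(a):
        score = min(a.km_from_last_login / 5000.0, 1.0) * 0.3
        score += a.asn_risk * 0.25
        score += a.asset_sensitivity * 0.2
        score += 0.0 if a.device_compliant else 0.15
        score += 0.0 if a.usual_hours else 0.1
        return score

    def decide(a):
        """Challenge when in doubt instead of plainly denying access."""
        score = risk_score(a)
        if score < 0.3:
            return "allow"
        if score < 0.6:
            return "challenge"  # e.g. require the security token again
        return "deny"

    print(decide(LoginAttempt("t01", 7200, 0.1, 0.8, True, False)))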
One challenge is which protocols need support. The two clear ones are:

* HTTP
* SSH

In addition there are the application gateways between micro-segments. I have scoped out the details of micro-segmentation from this post. Micro-segmentation is the basic idea of creating a fine mesh of network segments in the infrastructure so that no asset can communicate with another by default. The rest is then routed through e.g. a gateway such as Pomerium or, in high-performance cases, an application gateway - which may be a gateway for a specific binary protocol. The reason is control of all activity between services: being able to shape the terrain and deny access in it.

Even though this post is not about implementation, I will leave you with some examples of good open source starting points: Pomerium is a reverse proxy with the SSO capability, and the default capabilities of SSH take you far (ssh-ca and JumpHost).

             -------------> [ syslog server ] <-----------------
             |                     |                            |
      o      |                     |                            |
     /|\ [ Client ] ------> [ example.com ] <--------> [ app001.example.com ]
     / \     |              https - pomerium                    |
             |                    - SSH JumpHost                |
             |                     |                            |
         [ HIDS ] ---------------> [ NIDS ]

    Figure 1: Conceptual Defendable Infrastructure Overview

Now that a checkpoint is established in front of the infrastructure, the rest is a matter of traceability: taking the time to understand the data to gain insight, and finally developing and implementing tactics against your opponents.

Until next time.

[1] https://cloud.google.com/beyondcorp
[2] https://norcydef.blogspot.com/2013/03/tg13-forsvarbar-informasjonsinfrastrukt.html
[3] https://www.lockheedmartin.com/content/dam/lockheed-martin/rms/documents/cyber/LM-White-Paper-Defendable-Architectures.pdf
[4] The Tao of Network Security Monitoring: Beyond Intrusion Detection
[5] Extrusion Detection: Security Monitoring for Internal Intrusions
[6] https://blog.google/topics/safety-security/an-update-on-our-war-against-account/

diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..6c636ad
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,77 @@
+{
+  "nodes": {
+    "cl-nix-lite": {
+      "locked": {
+        "lastModified": 1721009305,
+        "narHash": "sha256-GtVd8VmPZB+J64VCf26yLbFUFRT1mdpzC8ylAHMIJoo=",
+        "owner": "hraban",
+        "repo": "cl-nix-lite",
+        "rev": "dc2793ec716b294739dabd6d99cc61543e6cd149",
+        "type": "github"
+      },
+      "original": {
+        "owner": "hraban",
+        "repo": "cl-nix-lite",
+        "type": "github"
+      }
+    },
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1710146030,
+        "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1722791413,
+        "narHash": "sha256-rCTrlCWvHzMCNcKxPE3Z/mMK2gDZ+BvvpEVyRM4tKmU=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "8b5b6723aca5a51edf075936439d9cd3947b7b2c",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-24.05",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "cl-nix-lite": "cl-nix-lite",
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix index
ebe348c..0a099a0 100644 --- a/flake.nix +++ b/flake.nix @@ -13,17 +13,57 @@ pkgs = nixpkgs.legacyPackages.${system}.extend cl-nix-lite.overlays.default; in { - - defaultPackage.x86_64-linux = - # Notice the reference to nixpkgs here. - with import nixpkgs { system = "x86_64-linux"; }; - stdenv.mkDerivation { - name = "hello"; - src = self; - buildPhase = "gcc -o hello ./hello.c"; - installPhase = "mkdir -p $out/bin; install -t $out/bin hello"; + packages = { + ecl = with pkgs.lispPackagesLiteFor pkgs.ecl; lispDerivation { + name = "thoughts"; + lispSystem = "thoughts"; + lispDependencies = [ + asdf + arrow-macros + ]; + src = pkgs.lib.cleanSource ./generator.lisp; + meta = { + license = pkgs.lib.licenses.agpl3Only; }; + buildInputs = [ + pkgs.ecl + pkgs.git + pkgs.gnumake + pkgs.asdf + pkgs.multimarkdown + ]; + + phases = [ "unpackPhase" "installPhase" "cleanupPhase" ]; + + unpackPhase = '' + mkdir -p $TMPDIR + cp ${./generator.lisp} $TMPDIR/generator.lisp + mkdir -p $TMPDIR/data + cp -r ${toString ./data}/* $TMPDIR/data/ + mkdir -p $TMPDIR/templates + cp -r ${toString ./templates}/* $TMPDIR/templates/ + mkdir -p $TMPDIR/static + cp -r ${toString ./static}/* $TMPDIR/static/ + ''; + + installPhase = '' + mkdir -p $out/html + mkdir -p $out/gemini + mkdir -p $TMPDIR/output/gemini/articles + mkdir -p $TMPDIR/output/html + mkdir -p $TMPDIR/temp/data + cd $TMPDIR + ecl --load $TMPDIR/generator.lisp + cp -r $TMPDIR/output/html/* $out/html/ + cp -r $TMPDIR/output/gemini/* $out/gemini/ + cp -r $TMPDIR $out/tmpdir + ''; + + cleanupPhase = '' + rm -rf $TMPDIR/temp + ''; + }; }; devShell = pkgs.mkShell {