commit 35755f0b05f7cf659ac74dbb22eda197c6ddcded
Author: 45mg <45mm.cartridge421@slmail.me>
Date:   Sat Apr 20 09:49:30 2024 +0000

    init

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c39433e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,36 @@
+This is a set of rough scripts that I used to help me convert my Logseq graph to org-roam. Use at your own risk, and please read the code before you use it.
+
+As I no longer have any use for this code (having finished converting my Logseq graph), I will not be maintaining it in any way. If you need any improvements, you'll have to fork the repo and work on them yourself.
+
+# Supported
+- conversion of Logseq's weird markdown dialect to org-mode: pandoc does most of the actual conversion, but various scripts are needed to massage Logseq's markdown into something it can understand
+- converting links: Logseq links/tags are converted to org-roam links; page aliases are supported
+
+# Not Supported
+- queries and embeds
+- images and other file assets
+- journals (see Instructions below)
+- many other things I didn't think of, no doubt
+
+# Requirements
+Tested with:
+
+```
+pandoc 3.1.9
+emacs 29.1
+```
+In theory newer versions should work.
+
+# Instructions
+- Clone this repo.
+- Back up your existing Logseq graph folder, just in case.
+- These scripts are designed to run on the 'pages' folder of your graph; decide what you're going to do with your journals. I combined them all into a single page like this:
+``` sh
+for file in journals/*; do
+    cat "$file" >> pages/journals.md
+done
+```
+- run the shell script `logseq-migration` on your graph's `pages` folder:
+  `path/to/this/repo/logseq-migration pages`
+  This will create a folder named `pages_` containing converted org-mode files; see the comments in `logseq-migration` for details.
+- In Emacs, set `graphdir` in `logseq-migration.el` to the path of your `pages_` folder, then run the code in that file. This will convert links and do some other post-processing.
diff --git a/add_title b/add_title
new file mode 100755
index 0000000..cb60f8d
--- /dev/null
+++ b/add_title
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Prepend an Org '#+title:' line to a converted page.
+# Usage: add_title GRAPHDIR FILE
+# The title is FILE's path relative to GRAPHDIR with the '.org' extension
+# removed, so GRAPHDIR/a/b.org gets the title 'a/b'.
+
+graphdir=$1
+file=$2
+
+# Drop the graph directory prefix, then one leading slash, then the extension.
+pagepath=${file#"$graphdir"}
+pagepath=${pagepath#/}
+title=${pagepath%.org}
+
+# Assemble the new contents in a temporary file and replace the original only
+# if that succeeded, so a failed write cannot leave a truncated page behind.
+{ printf '#+title: %s\n' "$title" && cat -- "$file"; } > "$file".temp &&
+    mv -- "$file".temp "$file"
diff --git a/backtick b/backtick
new file mode 100755
index 0000000..ea46c9f
--- /dev/null
+++ b/backtick
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Wrap Logseq block ids, block embeds and queries in backticks so that pandoc
+# emits them as Org-mode 'verbatim' text (eg. =foo=), which the elisp
+# post-processing can then find.
+
+IFS=$'\n'
+
+page=$1
+
+# Extended regex for a Logseq block id (a lowercase hexadecimal UUID)
+uuid_ere='[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}'
+
+# Extended regex for an embed target: hex digits, parentheses, brackets, dashes
+embed_target_ere='([0-9a-f]|\(|\)|\[|\]|-)+'
+
+# 'id:: <uuid>' block ids
+sed -E -i 's/id:: '"$uuid_ere"'/`\0`/' "$page"
+# '{{embed <target>}}' embeds
+sed -E -i 's/\{\{ *embed '"$embed_target_ere"' *\}\}/`\0`/' "$page"
+# '{{query ...}}' queries (basic regex here, so the braces are literal)
+sed -i 's/{{ *query.*}}/`\0`/' "$page"
diff --git a/convert b/convert
new file mode 100755
index 0000000..db49ab0
--- /dev/null
+++ b/convert
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Convert every Markdown file under the given directory (default: the current
+# directory) to Org with pandoc-cmd, deleting each .md file whose conversion
+# succeeded.
+
+# Locate the helper scripts next to this script instead of assuming that the
+# repo was cloned to one particular path.
+DIR=$(dirname -- "$(realpath -- "$0")")
+
+# find only runs the second -exec (rm) when the first (pandoc-cmd) exits 0.
+find "${1:-.}" -wholename "*.md" -exec "$DIR"/pandoc-cmd '{}' \; -exec rm '{}' \;
diff --git a/convert-links.el b/convert-links.el
new file mode 100755
index 0000000..28236e3
--- /dev/null
+++ b/convert-links.el
@@ -0,0 +1,166 @@
+;;; convert-links.el --- convert logseq links to org-id links -*- lexical-binding: t; -*-
+
+;;; Commentary:
+
+;; Link conversion for a Logseq graph whose pages have already been converted
+;; to Org files.  `logseq/convert-links' is the entry point: it assigns org ids
+;; to the pages and to the blocks that carried a Logseq id, then rewrites
+;; Logseq links, embeds and block references into org-roam links.
+
+;;; Code:
+
+(require 'cl-lib)
+(require 'org-id)
+;; Loading org-roam also declares `org-roam-directory' special, which the `let'
+;; in `logseq/convert-links' needs in order to bind it dynamically.
+(require 'org-roam)
+
+(defconst logseq/id-regexp
+  "[0-9a-f]\\{8\\}-\\(?:[0-9a-f]\\{4\\}-\\)\\{3\\}[0-9a-f]\\{12\\}"
+  "Regexp matching a Logseq block id (a lowercase hexadecimal UUID).")
+
+(defconst logseq/id-spec-regexp
+  (concat "=id:: " logseq/id-regexp "=\\( \\|\n\\)")
+  "Regexp matching a verbatim id:: marker and the space or newline after it.")
+
+(defconst logseq/filelink-target-regexp "[^]]*"
+  "Regexp matching a file link target: any characters other than `]'.")
+
+(defconst logseq/alias-regexp "^alias:: \\(.*?\\)$"
+  "Regexp matching a Logseq alias line; group 1 is the list of aliases.")
+
+(defconst logseq/embed-regexp
+  (concat "={{ *\\(embed\\) " logseq/id-regexp " *}}=")
+  "Regexp matching a verbatim embed of a block id; group 1 is \"embed\".")
+
+(defconst logseq/query-regexp
+  (concat "={{ *\\(query\\) " logseq/id-regexp " *}}=")
+  "Regexp matching a verbatim query of a block id; group 1 is \"query\".")
+
+(defconst logseq/link-regexp
+  (concat "\\[\\[file:((" logseq/id-regexp "))\\]\\[\\(.*\\)\\]\\]")
+  "Regexp matching an Org link to a block id; group 1 is its description.")
+
+(defconst logseq/filelink-regexp
+  (concat "\\[\\[file:\\(" logseq/filelink-target-regexp "\\)\\]\\]")
+  "Regexp matching an Org file link without description; group 1 is its target.")
+
+(defconst logseq/blockref-regexp
+  (concat "((" logseq/id-regexp "))")
+  "Regexp matching a bare Logseq block reference.")
+
+(defun logseq/--get-id-part (match)
+  "Return the lookup key that the matched link text MATCH refers to.
+If MATCH contains a Logseq block id, return that id.  Otherwise, if MATCH is
+a file link, return the link target.  Return nil if it is neither."
+  (cond
+   ((string-match logseq/id-regexp match)
+    (match-string-no-properties 0 match))
+   ((string-match logseq/filelink-regexp match)
+    (match-string-no-properties 1 match))))
+
+(defun logseq/--convert-id-links-in-file (file hmap)
+  "Generate org-ids in FILE, and return the association with their contexts.
+First, convert the file itself into an org-roam node; then, remove logseq
+id:: markers and convert the headlines they apply to into org-roam nodes (by
+assigning an org id).
+Add the associations to HMAP.  For file nodes, associate the page title and
+each page alias with the node's id.  For headline nodes, associate the
+replaced logseq id with the node id.
+FILE is visited with `find-file', saved, and left open.  Return HMAP."
+  (find-file file)
+  (goto-char (point-min))
+  ;; Convert the file itself into a node.  The search signals an error if
+  ;; FILE has no #+title line.
+  (re-search-forward "#\\+title: \\(.*\\)$")
+  (let* ((title (match-string-no-properties 1))
+         (id (org-id-get-create)))
+    (puthash title id hmap)
+    ;; Aliases resolve to the page node.  `save-excursion' keeps the id search
+    ;; below from skipping everything that precedes the alias line.
+    (save-excursion
+      (when (re-search-forward logseq/alias-regexp nil t)
+        (dolist (alias (split-string (match-string-no-properties 1) ", *"))
+          (puthash alias id hmap))))
+    ;; Search for logseq ids.
+    (while (re-search-forward logseq/id-spec-regexp nil t)
+      ;; Copy the matched text first: `logseq/--get-id-part' overwrites the
+      ;; match data that `replace-match' relies on.
+      (let ((marker (match-string-no-properties 0)))
+        ;; Delete the logseq id.
+        (replace-match "" nil nil)
+        ;; Key is the old logseq id; value is the newly created org id for
+        ;; the entry the marker was in.
+        (puthash (logseq/--get-id-part marker) (org-id-get-create) hmap))))
+  (save-buffer)
+  hmap)
+
+(defun logseq/convert-id-links (graphdir)
+  "Assign org ids in every Org file under GRAPHDIR.
+Return a hash table that maps page titles, page aliases and Logseq block ids
+to the org ids created for them."
+  ;; The keys are strings, so the table has to compare them with `equal':
+  ;; (eq "a" "a") -> nil; (eql "a" "a") -> nil; (equal "a" "a") -> t
+  (let ((hmap (make-hash-table :test 'equal)))
+    (dolist (file (directory-files-recursively graphdir "\\.org\\'"))
+      (logseq/--convert-id-links-in-file file hmap))
+    hmap))
+
+(defun logseq/--replace-with-org-links-in-file (file hmap)
+  "Replace Logseq link syntax in FILE with org-id links, using HMAP.
+HMAP maps page titles, aliases and Logseq block ids to org ids.
+A link whose target has no node is replaced by its first match group (the
+description, the link target, or the word \"embed\" or \"query\"), or removed
+if the match has no such group, and a message is logged.
+FILE is visited with `find-file', saved, and left open."
+  (find-file file)
+  (goto-char (point-min))
+  ;; Every replacement restarts the search from the top of the buffer, so the
+  ;; patterns are always tried in this order of precedence.
+  (while (or (re-search-forward logseq/embed-regexp nil t)
+             (re-search-forward logseq/query-regexp nil t)
+             (re-search-forward logseq/link-regexp nil t)
+             (re-search-forward logseq/filelink-regexp nil t)
+             (re-search-forward logseq/blockref-regexp nil t))
+    (let* ((match (match-string-no-properties 0))
+           (saved-match (match-data))
+           (org-id (gethash (logseq/--get-id-part match) hmap))
+           ;; Only query org-roam when the table actually had an id.
+           (node (and org-id (org-roam-node-from-id org-id))))
+      ;; The lookups above overwrite the match data; restore the buffer match
+      ;; before replacing it.
+      (set-match-data saved-match)
+      (if node
+          (progn
+            (replace-match "" t t)
+            ;; HACK: prevent org-roam-node-insert from reading a node from
+            ;; user; instead just use our node
+            (cl-letf (((symbol-function 'org-roam-node-read)
+                       (lambda (&rest _args) node)))
+              (org-roam-node-insert)))
+        ;; FIXEDCASE and LITERAL are set so that the replacement text is
+        ;; inserted exactly as it is, even if it contains a backslash.
+        (replace-match (or (match-string-no-properties 1) "") t t)
+        ;; Pass MATCH as an argument: it is data, not a format string.
+        (message "No node found for %s" match)))
+    (goto-char (point-min)))
+  (save-buffer))
+
+(defun logseq/replace-with-org-links (graphdir hmap)
+  "Replace Logseq link syntax with org-id links in the Org files under GRAPHDIR.
+HMAP is the table returned by `logseq/convert-id-links'."
+  (dolist (file (directory-files-recursively graphdir "\\.org\\'"))
+    (logseq/--replace-with-org-links-in-file file hmap)))
+
+(defun logseq/convert-links (graphdir)
+  "Convert the Logseq links in the Org files under GRAPHDIR to org-roam links.
+Org ids are assigned first; the org-roam database is then synced with
+`org-roam-directory' bound to GRAPHDIR, so that the new nodes can be looked
+up while the links are rewritten."
+  (let ((table (logseq/convert-id-links graphdir))
+        (org-roam-directory graphdir))
+    (org-roam-db-sync t)
+    (logseq/replace-with-org-links graphdir table)))
+
+(provide 'convert-links)
+
+;;; convert-links.el ends here
diff --git a/delete b/delete
new file mode 100755
index 0000000..812e1de
--- /dev/null
+++ b/delete
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Blank out 'collapsed:: ...' property lines. The match is anchored so that only
+# a line consisting of the property (optionally indented and/or preceded by a
+# list bullet) is affected; text that merely mentions 'collapsed:: ' is kept.
+# The indentation/bullet is left in place, so the number of lines is unchanged.
+sed -E -i 's/^([[:space:]]*(- )?)collapsed:: .*$/\1/' "$1"
diff --git a/headings b/headings
new file mode 100755
index 0000000..bcc9a7e
--- /dev/null
+++ b/headings
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Convert nested lists to #, ##, ###... so that pandoc's Org converter will turn
+# them into nested headings.
+#
+# Tabs are written as \t rather than as literal tab characters, which are
+# invisible and easily destroyed by editors that convert tabs to spaces.
+
+perl -pi -e '
+# Add an additional layer of nesting - all text in org files should be under a
+# heading
+s/^/\t/;
+# Discard any existing Markdown header syntax ("#" characters after the list bullet)
+s/(\t*- )#* /$1/;
+# For each level of indentation, add a "#"
+s/\t(?=\t*-)/#/g;
+# Finally, remove list bullets
+s/^(#*)-/$1/;
+# Remove any Tab characters remaining (eg. they will still be present in code
+# blocks)
+s/^\t+//' "$1"
+
+# Add newlines between headings - apparently this is required by Markdown syntax
+sed -i -E 's/^#/\'$'\n#/' "$1"
diff --git a/links b/links
new file mode 100755
index 0000000..65fffe9
--- /dev/null
+++ b/links
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Convert Logseq link syntax into standard Markdown link syntax.
+
+# Changing the path of links to the assets folder - if you don't know why you'd
+# need this, then comment it out
+sed -i 's/\[\.\.\/assets/\[assets/g' "$1"
+
+# Turn '#[[page name]]' tags into plain '[[page name]]' links (outside code
+# blocks), so that the two conversions below do not both rewrite them.
+sed -E -i '/```/,/```/ !s/(^|[[:space:]])#\[\[/\1[[/g' "$1"
+
+# Convert '[[pagename]]' links. The page name may not contain ']', so that
+# several links on one line are converted separately instead of being merged
+# into a single link spanning from the first '[[' to the last ']]'.
+sed -E -i 's/\[\[([^]]+)\]\]/[\1](\1)/g' "$1"
+
+# Convert '#pagename' links
+sed -E -i '/```/,/```/ !s/ #([^ ]+)/ [\1](\1)/g' "$1"
+# ('!' inverts range - everywhere except in code blocks. We need this because
+# code comments may begin with '#')
diff --git a/logseq-migration b/logseq-migration
new file mode 100755
index 0000000..f7e5876
--- /dev/null
+++ b/logseq-migration
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Main script, used to convert all the files in a directory to org files. Call
+# it with the directory as argument: for example, `./logseq-migration pages`.
+# This will create a copy of `pages` called `pages_`, and the conversion will be
+# applied to all files in `pages_` (`pages` itself is left untouched). Then it
+# will copy `pages_` to `pages__`, which will serve as a backup for when you do
+# further processing on `pages_`.
+#
+# Any existing `pages_` and `pages__` from a previous run are deleted first.
+
+if [[ -z $1 ]]; then
+    echo "usage: ${0##*/} DIRECTORY" >&2
+    exit 1
+fi
+
+# Drop a trailing slash, so that `pages/` yields `pages_` rather than `pages/_`
+# (a directory inside the graph itself).
+graph=${1%/}
+
+# Find the helper scripts next to this script, wherever the repo was cloned.
+scriptdir=$(dirname -- "$(realpath -- "$0")")
+export scriptdir
+
+# -f: do not complain on the first run, when there is nothing to remove yet.
+rm -rf -- "${graph}_" "${graph}__"
+cp -r -- "$graph" "${graph}_"
+"$scriptdir"/preproc "${graph}_"
+"$scriptdir"/convert "${graph}_"
+"$scriptdir"/postproc "${graph}_"
+cp -r -- "${graph}_" "${graph}__"
diff --git a/logseq-migration.el b/logseq-migration.el
new file mode 100644
index 0000000..915bb07
--- /dev/null
+++ b/logseq-migration.el
@@ -0,0 +1,16 @@
+;;; logseq-migration.el --- elisp processing of converted logseq graph -*- lexical-binding: t; -*-
+
+;; Load the helper files from the directory containing this file, so that this
+;; works whatever the `default-directory' of the current buffer is.
+(let ((here (file-name-directory
+             (or load-file-name buffer-file-name default-directory))))
+  (dolist (lib '("convert-links.el"
+                 "remove-custom-ids.el"
+                 "remove-logseq-property-entries.el"))
+    (load-file (expand-file-name lib here))))
+
+(let ((graphdir "path/to/my/graph/pages_")) ;; change as needed
+  (logseq/remove-custom-ids graphdir)
+  (logseq/convert-links graphdir)
+  ;; comment out the below line if you don't want to delete logseq properties
+  (logseq/remove-logseq-property-entries graphdir))
diff --git a/namespaces b/namespaces
new file mode 100755
index 0000000..cbaed6c
--- /dev/null
+++ b/namespaces
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Represent Logseq's 'namespaces' by moving the page files into directories.
+# For example, a page named 'a/b/c' in Logseq, whose file is named 'a___b___c',
+# will be stored as 'c' in the path 'a/b'.
+
+old=$1
+# If the page is not under a namespace, this yields the same filename.
+new=${old//___//}
+
+# Nothing to move for a page outside any namespace; this also avoids the
+# same-file error that mv would raise.
+[[ $new == "$old" ]] && exit 0
+
+# Paths are used as given rather than prefixed with $PWD, so that absolute
+# paths work as well as relative ones.
+mkdir -p -- "$(dirname -- "$new")" && mv -- "$old" "$new"
diff --git a/pandoc-cmd b/pandoc-cmd
new file mode 100755
index 0000000..cbc4e85
--- /dev/null
+++ b/pandoc-cmd
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Run pandoc on a Markdown file, writing the Org result next to it under the
+# same name with the extension replaced by .org
+
+src=$1
+dest=${src%.*}.org
+
+pandoc --wrap=none --from=markdown --to=org --output="$dest" "$src"
diff --git a/postproc b/postproc
new file mode 100755
index 0000000..6fd7e4c
--- /dev/null
+++ b/postproc
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Post-process the converted graph: give every .org file under the given
+# directory (default: the current directory) a '#+title:' line via add_title.
+
+# Find the helper scripts next to this script, wherever the repo was cloned.
+scriptdir=$(dirname -- "$(realpath -- "$0")")
+
+# Pass add_title the same root that find searches, so that the '.' default is
+# stripped from the titles too.
+root=${1:-.}
+
+find "$root" -wholename "*.org" -exec "$scriptdir"/add_title "$root" '{}' \;
diff --git a/preproc b/preproc
new file mode 100755
index 0000000..1f5c5d4
--- /dev/null
+++ b/preproc
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Overall preprocessing of Logseq markdown before converting to org
+
+# Find the helper scripts next to this script, wherever the repo was cloned.
+scriptdir=$(dirname -- "$(realpath -- "$0")")
+
+root=${1:-.}
+
+# First pass: rewrite the contents of every Markdown file in place. Each step
+# only runs if the previous one succeeded for that file.
+find "$root" -wholename "*.md" \
+    -exec "$scriptdir"/properties '{}' \; \
+    -exec "$scriptdir"/backtick '{}' \; \
+    -exec "$scriptdir"/delete '{}' \; \
+    -exec "$scriptdir"/headings '{}' \; \
+    -exec "$scriptdir"/links '{}' \;
+
+# Second pass: move namespaced pages into directories. This is kept separate
+# because it creates directories inside the tree being traversed; if find then
+# visits a moved file again, 'namespaces' simply has nothing left to do.
+find "$root" -wholename "*.md" -exec "$scriptdir"/namespaces '{}' \;
diff --git a/properties b/properties
new file mode 100755
index 0000000..a93df43
--- /dev/null
+++ b/properties
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# Wrap every property line in a list item of its own, marked
+# '!property-deleteme!', because otherwise the pandoc org parser merges them
+# all into a single line for some reason. Only lines that start at column 0
+# with lowercase ASCII letters followed by '::' are treated as properties.
+page=$1
+sed -E -i 's/^[a-z]+::.*$/- !property-deleteme!\n  \0/' "$page"
diff --git a/remove-custom-ids.el b/remove-custom-ids.el
new file mode 100644
index 0000000..0e5bdbe
--- /dev/null
+++ b/remove-custom-ids.el
@@ -0,0 +1,13 @@
+;;; remove-custom-ids.el --- Remove CUSTOM_ID property assigned by pandoc org parser -*- lexical-binding: t; -*-
+
+(require 'org)
+
+(defun logseq/remove-custom-ids (graphdir)
+  "Delete every CUSTOM_ID property from the Org files under GRAPHDIR.
+Files whose names end in \".org\" are searched for recursively and rewritten
+in place."
+  (dolist (file (directory-files-recursively graphdir "\\.org\\'"))
+    ;; `with-temp-file' writes the temporary buffer to FILE after the body.
+    (with-temp-file file
+      (insert-file-contents file)
+      (org-delete-property-globally "CUSTOM_ID"))))
diff --git a/remove-logseq-property-entries.el b/remove-logseq-property-entries.el
new file mode 100644
index 0000000..371a7d0
--- /dev/null
+++ b/remove-logseq-property-entries.el
@@ -0,0 +1,10 @@
+;;; remove-logseq-property-entries.el --- Remove logseq property subtrees -*- lexical-binding: t; -*-
+
+;; Beware: entries are removed with `org-cut-subtree'; this will write to your kill ring!
+(defun logseq/remove-logseq-property-entries (graphdir)
+  "Delete the Logseq property entries from the files under GRAPHDIR.
+The properties shell script moved each Logseq property into a block marked
+\"!property-deleteme!\" so that pandoc would not mangle it; cut those blocks."
+  (org-map-entries #'org-cut-subtree
+                   "!property-deleteme!"
+                   (directory-files-recursively graphdir "org$")))