commit 35755f0b05f7cf659ac74dbb22eda197c6ddcded
Author: 45mg <45mm.cartridge421@slmail.me>
Date:   Sat Apr 20 09:49:30 2024 +0000

    init

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c39433e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,36 @@
+This is a set of rough scripts that I used to help me convert my Logseq graph to org-roam. Use at your own risk, and please read the code before you use it.
+
+As I no longer have any use for this code (having finished converting my Logseq graph), I will not be maintaining it in any way. If you need any improvements, you'll have to fork the repo and work on them yourself.
+
+# Supported
+- conversion of Logseq's weird markdown dialect to org-mode: pandoc does most of the actual conversion, but various scripts are needed to massage Logseq's markdown into something it can understand
+- converting links: Logseq links/tags are converted to org-roam links; page aliases are supported
+
+# Not Supported
+- queries and embeds
+- images and other file assets
+- journals (see Instructions below)
+- many other things I didn't think of, no doubt
+
+# Requirements
+Tested with:
+
+```
+pandoc 3.1.9
+emacs 29.1
+```
+In theory newer versions should work.
+
+# Instructions
+- Clone this repo.
+- Back up your existing Logseq graph folder, just in case.
+- These scripts are designed to run on the 'pages' folder of your graph; decide what you're going to do with your journals. I combined them all into a single page like this:
+``` sh
+for file in journals/*; do
+    cat "$file" >> pages/journals.md
+done
+```
+- Run the shell script `logseq-migration` on your graph's `pages` folder:
+  `path/to/this/repo/logseq-migration pages`
+  This will create a folder named `pages_` containing converted org-mode files; see the comments in `logseq-migration` for details.
+- In Emacs, set `graphdir` in `logseq-migration.el` to the path of your `pages_` folder, then run the code in that file. This will convert links and do some other post-processing.
diff --git a/add_title b/add_title
new file mode 100755
index 0000000..cb60f8d
--- /dev/null
+++ b/add_title
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+graphdir=$1
+pagepath=$(sed 's/^\///' <<<"${2#"$graphdir"}")
+title=${pagepath%.org}
+
+echo "#+title: $title" > "$2".temp
+cat "$2" >> "$2".temp
+mv "$2".temp "$2"
diff --git a/backtick b/backtick
new file mode 100755
index 0000000..ea46c9f
--- /dev/null
+++ b/backtick
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Enclose Logseq block ids and block embeds in backticks.
+# This is so pandoc will turn them into Org-mode 'verbatim' notation (eg.
+# =foo=), which we can then process with elisp.
+
+# set -x
+IFS=$'\n'
+
+# Match Logseq IDs (extended regex)
+id_eregex='[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}'
+
+# Match the targets of embed/query syntax (extended regex)
+target_eregex='([0-9a-f]|\(|\)|\[|\]|-)+'
+
+# block ids ('&' is the portable spelling of "the whole match")
+sed -E -i 's/id:: '"$id_eregex"'/`&`/' "$1"
+# embeds
+sed -E -i 's/\{\{ *embed '"$target_eregex"' *\}\}/`&`/' "$1"
+# queries
+sed -i 's/{{ *query.*}}/`&`/' "$1"
diff --git a/convert b/convert
new file mode 100755
index 0000000..db49ab0
--- /dev/null
+++ b/convert
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+DIR=$(dirname -- "$(realpath -- "${BASH_SOURCE[0]}")") # helper scripts live next to this one
+
+find "${1:-.}" -wholename "*.md" -exec "$DIR"/pandoc-cmd '{}' \; -exec rm '{}' \;
diff --git a/convert-links.el b/convert-links.el
new file mode 100755
index 0000000..28236e3
--- /dev/null
+++ b/convert-links.el
@@ -0,0 +1,106 @@
+;;; convert-links.el --- convert logseq links to org-id links -*- lexical-binding: t; -*-
+
+(setq logseq/id-regexp "[0-9a-f]\\{8\\}-\\(?:[0-9a-f]\\{4\\}-\\)\\{3\\}[0-9a-f]\\{12\\}")
+(setq logseq/id-spec-regexp (concat "=id:: " logseq/id-regexp "=\\( \\|\n\\)"))
+(setq logseq/filelink-target-regexp "[^]]*")
+(setq logseq/alias-regexp "^alias:: \\(.*?\\)$")
+
+(defun logseq/--get-id-part (match)
+  ;; check if it's an id link
+  (if (string-match logseq/id-regexp match)
+      (substring-no-properties (match-string 0 match))
+    ;; if not, check if it's a file link
+    (when (string-match logseq/filelink-regexp match)
+      (substring-no-properties (match-string 1 match)))))
+
+(defun logseq/--convert-id-links-in-file (file hmap)
+  "Generate org-ids in FILE, and return the association with their contexts.
+First, convert the file itself into an org-roam node; then, remove logseq
+'id::'s and convert the headlines they apply to into org-roam nodes (by
+assigning an org id).
+Add the associations to HMAP. For file nodes, associate the page title with
+the node's id. For headline nodes, associate the replaced logseq id with the
+node id."
+  (find-file file)
+  (goto-char (point-min))
+  ;; convert the file itself into a node
+  (re-search-forward "#\\+title: \\(.*\\)$")
+  (let* ((title (substring-no-properties (match-string 1)))
+         (id (org-id-get-create)))
+    (puthash title id hmap)
+    (if (save-excursion (re-search-forward logseq/alias-regexp nil t)) ; keep point: ids above the alias line must still be scanned
+        (let ((aliases (split-string (match-string-no-properties 1) ", *")))
+          (message "%s" aliases)
+          (dolist (alias aliases) (puthash alias id hmap))))
+    ;; search for logseq ids
+    (while (re-search-forward logseq/id-spec-regexp nil t)
+      (let ((match (substring-no-properties (match-string 0))))
+        ;; delete logseq id
+        (replace-match "" nil nil)
+        (let ((id-part (logseq/--get-id-part match)))
+          ;; key is the old logseq id; value is newly created org id for this entry
+          (puthash id-part (org-id-get-create) hmap)))))
+  (save-buffer)
+  hmap)
+
+(defun logseq/convert-id-links (graphdir)
+  ;; (eq "a" "a") -> nil; (eql "a" "a") -> nil; (equal "a" "a") -> t
+  (let ((hmap (make-hash-table :test 'equal)))
+    (dolist (file (directory-files-recursively graphdir "org$"))
+      (logseq/--convert-id-links-in-file file hmap))
+    hmap))
+
+(setq logseq/embed-regexp (concat "={{ *\\(embed\\) " logseq/id-regexp " *}}="))
+(setq logseq/query-regexp (concat "={{ *\\(query\\) " logseq/id-regexp " *}}="))
+(setq logseq/link-regexp (concat "\\[\\[file:((" logseq/id-regexp "))\\]\\[\\(.*\\)\\]\\]"))
+(setq logseq/filelink-regexp
+      (concat
"\\[\\[file:\\(" logseq/filelink-target-regexp "\\)\\]\\]")) +(setq logseq/blockref-regexp (concat "((" logseq/id-regexp "))")) + +(defun logseq/--replace-with-org-links-in-file (file hmap) + "Replace Logseq link syntax in FILE with org-id links, based on the +associations in HMAP." + (find-file file) + (goto-char 0) + (while (or (re-search-forward logseq/embed-regexp nil t) + (re-search-forward logseq/query-regexp nil t) + (re-search-forward logseq/link-regexp nil t) + (re-search-forward logseq/filelink-regexp nil t) + (re-search-forward logseq/blockref-regexp nil t)) + (let* ((match (substring-no-properties (match-string 0))) + (match-data (match-data)) + (id-part (logseq/--get-id-part match)) + (node-struct (org-roam-node-from-id + (gethash id-part hmap)))) + (set-match-data match-data) + (when node-struct + ;; HACK: prevent org-roam-node-insert from reading a node from + ;; user; instead just use our node + (replace-match "" nil nil) + (cl-letf (((symbol-function 'org-roam-node-read) + (lambda (&rest args) node-struct))) + ;; ((symbol-function 'org-roam-node-formatted) + ;; (lambda (node) (org-roam-node-title node)))) + (org-roam-node-insert))) + (set-match-data match-data) + (unless node-struct + (replace-match (substring-no-properties + (or (match-string 1) "")) nil nil) + (message (format "No node found for %s" match)))) + (goto-char 0)) + (save-buffer)) + +(defun logseq/replace-with-org-links (graphdir hmap) + (dolist (file (directory-files-recursively graphdir "org$")) + (logseq/--replace-with-org-links-in-file file hmap))) + +;; (let* ((file "~/scratchdir/logseq_main_/pages/linux/arch/installation.org") +;; (table (logseq/--convert-id-links-in-file file (make-hash-table)))) +;; (org-roam-db-sync t) +;; (logseq/--replace-with-org-links-in-file file table)) + +(defun logseq/convert-links (graphdir) + (let ((table (logseq/convert-id-links graphdir)) + (org-roam-directory graphdir)) + (org-roam-db-sync t) + (logseq/replace-with-org-links graphdir table))) 
diff --git a/delete b/delete
new file mode 100755
index 0000000..812e1de
--- /dev/null
+++ b/delete
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+# Delete 'collapsed:: '
+sed -i 's/collapsed:: .*//' "$1"
diff --git a/headings b/headings
new file mode 100755
index 0000000..bcc9a7e
--- /dev/null
+++ b/headings
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Convert nested lists to #, ##, ###... so that pandoc's Org converter will turn
+# them into nested headings.
+
+perl -pi -e '
+# Add an additional layer of nesting - all text in org files should be under a
+# heading
+s/^/\t/;
+# Discard any existing Markdown header syntax ("#" characters after the list bullet)
+s/([ \t]*- )#* /$1/;
+# For each level of indentation, add a "#"
+s/\t(?=\t*-)/#/g;
+# Finally, remove list bullets
+s/^(#*)-/$1/;
+# Remove any Tab characters remaining (eg. they will still be present in code
+# blocks)
+s/^\t+//' "$1"
+
+# Add newlines between headings - apparently this is required by Markdown syntax
+sed -i -E 's/^#/\'$'\n#/' "$1"
diff --git a/links b/links
new file mode 100755
index 0000000..65fffe9
--- /dev/null
+++ b/links
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Convert Logseq link syntax into standard Markdown link syntax.
+
+# Changing the path of links to the assets folder - if you don't know why you'd
+# need this, then comment it out
+sed -i 's/\[\.\.\/assets/\[assets/g' "$1"
+
+# Convert '[[pagename]]' links ('[^]]*' keeps several links on one line separate)
+sed -E -i 's/\[\[([^]]*)\]\]/[\1](\1)/g' "$1"
+
+# Convert '#pagename' links
+sed -E -i '/```/,/```/ !s/ #([^ ]+)/ [\1](\1)/g' "$1"
+# ('!' inverts range - everywhere except in code blocks. We need this because
+# code comments may begin with '#')
diff --git a/logseq-migration b/logseq-migration
new file mode 100755
index 0000000..f7e5876
--- /dev/null
+++ b/logseq-migration
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Main script, used to convert all the files in a directory to org files. Call
+# it with the directory as argument: for example, `./logseq-migration pages`.
+# This will create a copy of `pages` called `pages_`, and the conversion will be
+# applied to all files in `pages_`. Then it will copy `pages_` to `pages__`,
+# which will serve as a backup for when you do further processing on `pages_`.
+
+export scriptdir=$(dirname -- "$(realpath -- "${BASH_SOURCE[0]}")") # helper scripts live next to this one
+
+rm -rf -- "${1:?usage: logseq-migration DIR}"_ "$1"__ # -f: the copies do not exist on a first run
+cp -r "$1" "$1"_
+"$scriptdir"/preproc "$1"_
+"$scriptdir"/convert "$1"_
+"$scriptdir"/postproc "$1"_
+cp -r "$1"_ "$1"__
diff --git a/logseq-migration.el b/logseq-migration.el
new file mode 100644
index 0000000..915bb07
--- /dev/null
+++ b/logseq-migration.el
@@ -0,0 +1,11 @@
+;;; logseq-migration.el --- elisp processing of converted logseq graph -*- lexical-binding: t; -*-
+
+(load-file "convert-links.el")
+(load-file "remove-custom-ids.el")
+(load-file "remove-logseq-property-entries.el")
+(let ((graphdir "path/to/my/graph/pages_")) ;; change as needed
+  (logseq/remove-custom-ids graphdir)
+  (logseq/convert-links graphdir)
+  ;; comment out the below line if you don't want to delete logseq properties
+  (logseq/remove-logseq-property-entries graphdir)
+  )
diff --git a/namespaces b/namespaces
new file mode 100755
index 0000000..cbaed6c
--- /dev/null
+++ b/namespaces
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Represent Logseq's 'namespaces' by moving the page files into directories.
+# For example, a page named 'a/b/c' in Logseq, whose file is named 'a___b___c',
+# will be stored as 'c' in the path 'a/b'.
+
+# if the page is not under a namespace, will return the same filename
+path=$(sed 's/___/\//g' <<<"$1")
+
+dir=$(dirname -- "$path")
+
+# echo $dir
+# echo $path
+mkdir -p -- "$dir"
+
+old=$1 # used as given: relative paths resolve against $PWD, absolute ones stay valid
+new=$path
+
+# to avoid same-file errors, run `mv` only if the file would actually be moved
+# (if it was not under a namespace, $old and $new are the same thing)
+[[ $(realpath "$old") = "$(realpath "$new")" ]] || mv -- "$old" "$new"
diff --git a/pandoc-cmd b/pandoc-cmd
new file mode 100755
index 0000000..cbc4e85
--- /dev/null
+++ b/pandoc-cmd
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Call pandoc on a .md file to produce a .org file
+
+pandoc --wrap=none -f markdown -t org -o "${1%.*}.org" "$1"
diff --git a/postproc b/postproc
new file mode 100755
index 0000000..6fd7e4c
--- /dev/null
+++ b/postproc
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+scriptdir=$(dirname -- "$(realpath -- "${BASH_SOURCE[0]}")") # helper scripts live next to this one
+
+find "${1:-.}" -wholename "*.org" -exec "$scriptdir"/add_title "${1:-.}" '{}' \;
diff --git a/preproc b/preproc
new file mode 100755
index 0000000..1f5c5d4
--- /dev/null
+++ b/preproc
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Overall preprocessing of Logseq markdown before converting to org
+
+scriptdir=$(dirname -- "$(realpath -- "${BASH_SOURCE[0]}")") # helper scripts live next to this one
+
+find "${1:-.}" -wholename "*.md" -exec "$scriptdir"/properties '{}' \; -exec "$scriptdir"/backtick '{}' \; -exec "$scriptdir"/delete '{}' \; -exec "$scriptdir"/headings '{}' \; -exec "$scriptdir"/links '{}' \; -exec "$scriptdir"/namespaces '{}' \;
diff --git a/properties b/properties
new file mode 100755
index 0000000..a93df43
--- /dev/null
+++ b/properties
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+# Move any property into a block, otherwise the pandoc org parser merges them
+# all into a single line for some reason
+sed -E -i 's/^[a-z]+::.*$/- !property-deleteme!\n &/' "$1"
diff --git a/remove-custom-ids.el b/remove-custom-ids.el
new file mode 100644
index 0000000..0e5bdbe
--- /dev/null
+++ b/remove-custom-ids.el
@@ -0,0 +1,7 @@
+;;; remove-custom-ids.el --- Remove CUSTOM_ID property assigned by
pandoc org parser -*- lexical-binding: t; -*-
+
+(defun logseq/remove-custom-ids (graphdir)
+  (dolist (file (directory-files-recursively graphdir "org$"))
+    (with-temp-file file
+      (insert-file-contents file)
+      (org-delete-property-globally "CUSTOM_ID"))))
diff --git a/remove-logseq-property-entries.el b/remove-logseq-property-entries.el
new file mode 100644
index 0000000..371a7d0
--- /dev/null
+++ b/remove-logseq-property-entries.el
@@ -0,0 +1,17 @@
+;;; remove-logseq-property-entries.el --- Remove logseq property subtrees -*- lexical-binding: t; -*-
+
+;; this will write to your kill ring!
+(defun logseq/remove-logseq-property-entries (graphdir)
+  "In the 'properties' shell script, we moved each logseq property into a block
+to prevent pandoc's org parser from messing them up. Now that we don't need the
+properties anymore, we delete them."
+  ;; NOTE(review): the MATCH argument is parsed by Org as a tags/property
+  ;; search string - confirm it selects only the placeholder headings.
+  (org-map-entries
+   (lambda ()
+     (org-cut-subtree)
+     ;; The cut leaves point on whatever followed the removed subtree; resume
+     ;; the scan right there so that the entry after a cut one is not skipped.
+     (setq org-map-continue-from (point)))
+   "!property-deleteme!"
+   (directory-files-recursively graphdir "org$")))