From 26e9cf902f74eb86f414c353c0fd185a791bf79e Mon Sep 17 00:00:00 2001
From: Chris Hodapp <hodapp87@gmail.com>
Date: Thu, 9 Apr 2020 17:48:32 -0400
Subject: [PATCH] Add my ad-hoc scripts (for now)

---
 hugo_blag/md_get_wp_links.py | 55 +++++++++++++++++++++++++++++++++
 hugo_blag/org_fix_links.py   | 59 ++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100755 hugo_blag/md_get_wp_links.py
 create mode 100755 hugo_blag/org_fix_links.py

diff --git a/hugo_blag/md_get_wp_links.py b/hugo_blag/md_get_wp_links.py
new file mode 100755
index 0000000..a68df23
--- /dev/null
+++ b/hugo_blag/md_get_wp_links.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+import os
+
+search_dir = "/home/hodapp/source/blag/wp-content-old/"  # root searched for the original files
+search_hints = ["uploads"]  # subdirectories optionally prepended under search_dir
+
+dest_dir =  "/home/hodapp/source/blag/hugo_blag/static/wp_old"  # root for the emitted mkdir/cp targets
+dest_rel = ["/wp_old"]  # prefix of the replacement link paths
+
+# Matches an http(s) URL ending in an image extension, wrapped in ( and ).
+# Group 1 is the URL itself; group 2 is the filename with its extension
+# (without path; the name part may not contain a '.'); group 3 is the
+# extension alone.
+link_re = re.compile(r"\((http[^)]+/([^).]+\.(jpg|jpeg|JPG|JPEG|gif|GIF|png|PNG)))\)")
+
+# Output on stdout is shell commands (mkdir/cp) plus '#' diagnostics.
+replacements = {}  # matched URL -> site-relative replacement path
+# Read every line up front so the input file is closed promptly.
+with open(sys.argv[1], "r") as src:
+    data = src.readlines()
+for n,line in enumerate(data, 1):  # n is the 1-based line number
+    for url,base,_ in link_re.findall(line):
+        print("# line {}: url={} base={}".format(n, url, base))
+        parts = url.split("/")
+        # Walk *backwards* from filename, adding one part of the path
+        # at a time, and try to find a file by this name in
+        # 'search_dir' (and with 'search_hints' added one at a time):
+        found = None
+        for i in range(1,len(parts)+1):
+            if found is not None:
+                break
+            for j in range(len(search_hints) + 1):
+                rel = parts[-i:]
+                p = os.path.join(search_dir, *(search_hints[:j] + rel))
+                if os.path.isfile(p):
+                    print("# found file: {}".format(p))
+                    dst = os.path.join(dest_dir, *rel)
+                    print("mkdir -p {}".format(os.path.join(dest_dir, *rel[:-1])))
+                    print("cp {} {}".format(p, dst))
+                    rel_dst = os.path.join(*(dest_rel + rel))
+                    print("# Map {} to {}".format(url, rel_dst))
+                    replacements[url] = rel_dst
+                    found = p
+                    break
+        if found is None:
+            print("# *** not found")
+
+with open(sys.argv[2], "w+") as out:  # rewritten copy of the input
+    for text in data:
+        for old_url, new_path in replacements.items():
+            text = text.replace(old_url, new_path)
+        out.write(text)
diff --git a/hugo_blag/org_fix_links.py b/hugo_blag/org_fix_links.py
new file mode 100755
index 0000000..6527b1d
--- /dev/null
+++ b/hugo_blag/org_fix_links.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+# ox-hugo handles links that are like [[foo][Text]] - i.e. a link to
+# 'foo' which displays as 'Text'.  Org also has links like [[foo]]
+# which behave identically to [[foo][foo]], i.e. they use the link
+# itself as the text to show.  ox-hugo doesn't handle these, however -
+# they simply disappear in the generated Markdown.
+#
+# This script just uses a regex to turn [[foo]] into [[foo][foo]].
+
+import re
+import sys
+
+# Bare link: any text wrapped in [[ and ]] provided that text contains
+# no square brackets.  That text itself is then group 1.
+link_re = re.compile(r"\[\[([^][]+)\]\]")
+# Bare image URL: one non-bracket character (matched, but outside
+# group 1) followed by http...<image extension>, which is group 1.
+bare_link_re = re.compile(r"[^][](http\S*(jpg|jpeg|JPG|JPEG|gif|GIF|png|PNG))")
+# Link with description, [[link][description]]; groups 1 and 2.
+proper_link_re = re.compile(r"\[\[([^][]+)\]\[([^][]+)\]\]")
+# Target: <<name>> with no angle brackets inside; name is group 1.
+target_re = re.compile(r"<<([^\<\>]+)>>")
+
+seen = set()  # link texts / URLs encountered in the input
+targets = set()  # names found inside <<...>>
+seen_line = {}  # link text -> line number recorded for it
+
+def fix_link(m):
+    """Record a bare link's text (group 1); return it as [[text][text]]."""
+    seen.add(m.group(1))
+    return "".join(["[[", m.group(1), "][", m.group(1), "]]"])
+
+def fix_image_link(m):
+    """Record a bare image URL (group 1); return it as [[url][image]]."""
+    seen.add(m.group(1))
+    # bare_link_re also consumes one character before the URL; re-emit it.
+    return m.string[m.start():m.start(1)] + "[[" + m.group(1) + "][image]]"
+
+# Record every link and target on each line, then echo the line with
+# bare [[foo]] links and bare image URLs rewritten.
+for n,line in enumerate(sys.stdin, 1):  # n is the 1-based line number
+    for g1,_ in proper_link_re.findall(line):
+        seen.add(g1)
+        seen_line[g1] = n
+    for g1 in link_re.findall(line):
+        seen.add(g1)
+        seen_line[g1] = n
+    for g1 in target_re.findall(line):
+        targets.add(g1)
+    fixed = link_re.sub(fix_link, line)
+    sys.stdout.write(bare_link_re.sub(fix_image_link, fixed))
+
+# Report non-http links with no matching <<target>>, in line order.
+sys.stdout.write("* Missing links\n")
+for link in sorted(seen.difference(targets), key=lambda t: (seen_line.get(t, 0), t)):
+    if link.startswith("http") or link not in seen_line:
+        continue
+    sys.stdout.write("- line {}: <<{}>>\n".format(seen_line[link], link))