# Extracted from patch 26e9cf902f74eb86f414c353c0fd185a791bf79e
# From: Chris Hodapp
# Date: Thu, 9 Apr 2020 17:48:32 -0400
# Subject: [PATCH] Add my ad-hoc scripts (for now)
#
# ---- hugo_blag/md_get_wp_links.py (new file, mode 100755) ----
#!/usr/bin/env python3
"""Re-home old WordPress image links found in a Markdown file.

Usage: md_get_wp_links.py INPUT.md OUTPUT.md

For every image URL wrapped in parentheses in INPUT.md, look for a file
with a matching path tail underneath ``search_dir``.  For each one found:

* print (to stdout) the ``mkdir``/``cp`` shell commands that would copy
  the file under ``dest_dir``; nothing is copied by this script itself;
* rewrite the URL to a site-relative path under ``dest_rel`` in the copy
  of the document written to OUTPUT.md.

Lines starting with ``#`` on stdout are diagnostics, so the whole output
can be piped to a shell.
"""

import os
import re
import shlex
import sys

search_dir = "/home/hodapp/source/blag/wp-content-old/"
search_hints = ["uploads"]

dest_dir = "/home/hodapp/source/blag/hugo_blag/static/wp_old"
dest_rel = ["/wp_old"]

# Matches text like http://.../wp-content/... wrapped in ( and ).
# Group 1 is the URL itself.
# Group 2 is the base filename (without path or extension).
# Group 3 is the image extension.
link_re = re.compile(r"\((http[^)]+/([^).]+\.(jpg|jpeg|JPG|JPEG|gif|GIF|png|PNG)))\)")


def find_local_copy(url, root, hints):
    """Locate a file under *root* whose path ends like *url*.

    Walks *backwards* from the filename, adding one more URL path
    component per step, and for each tail tries it directly under *root*
    and then with a growing prefix of *hints* inserted in between.

    Returns ``(path, rel)`` for the first existing file, where ``rel`` is
    the list of URL components that matched, or ``None`` if nothing
    matched.
    """
    parts = url.split("/")
    hints = list(hints)
    for depth in range(1, len(parts) + 1):
        rel = parts[-depth:]
        for n_hints in range(len(hints) + 1):
            candidate = os.path.join(root, *(hints[:n_hints] + rel))
            if os.path.isfile(candidate):
                return candidate, rel
    return None


def collect_replacements(lines, root=None, hints=None, dest_root=None,
                         dest_prefix=None):
    """Scan *lines* for image URLs and work out where each one should move.

    Prints diagnostics and the ``mkdir``/``cp`` commands as a side effect.
    The optional arguments default to the module-level ``search_dir``,
    ``search_hints``, ``dest_dir`` and ``dest_rel`` settings.

    Returns a dict mapping each URL that was found on disk to its new
    site-relative path.
    """
    root = search_dir if root is None else root
    hints = search_hints if hints is None else hints
    dest_root = dest_dir if dest_root is None else dest_root
    dest_prefix = dest_rel if dest_prefix is None else dest_prefix

    replacements = {}
    # Line numbers are reported 1-based, as editors show them.
    for lineno, line in enumerate(lines, start=1):
        for url, base, _ext in link_re.findall(line):
            print("# line {}: url={} base={}".format(lineno, url, base))
            hit = find_local_copy(url, root, hints)
            if hit is None:
                print("# *** not found")
                continue
            src, rel = hit
            dst = os.path.join(dest_root, *rel)
            print("# found file: {}".format(src))
            # Quote every path: these lines are meant to be fed to a
            # shell, and file names may contain spaces or metacharacters.
            print("mkdir -p {}".format(
                shlex.quote(os.path.join(dest_root, *rel[:-1]))))
            print("cp {} {}".format(shlex.quote(src), shlex.quote(dst)))
            rel_dst = os.path.join(*(list(dest_prefix) + rel))
            print("# Map {} to {}".format(url, rel_dst))
            replacements[url] = rel_dst
    return replacements


def apply_replacements(line, replacements):
    """Return *line* with every key of *replacements* swapped for its value.

    Longer URLs are substituted first so that a URL which happens to be a
    prefix of another one cannot corrupt the longer match.
    """
    for url in sorted(replacements, key=len, reverse=True):
        line = line.replace(url, replacements[url])
    return line


def main(argv=None):
    """Entry point: read ``argv[0]``, write the rewritten copy to ``argv[1]``.

    *argv* defaults to ``sys.argv[1:]``; extra arguments are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: md_get_wp_links.py INPUT.md OUTPUT.md")
    src_path, dst_path = args[0], args[1]

    # Read everything up front (and close the file) so that INPUT and
    # OUTPUT may safely be the same path.
    with open(src_path, "r") as f:
        data = f.readlines()

    replacements = collect_replacements(data)

    with open(dst_path, "w") as f:
        f.writelines(apply_replacements(line, replacements) for line in data)


if __name__ == "__main__":
    main()

# ---- hugo_blag/org_fix_links.py (new file, mode 100755) ----
#!/usr/bin/env python3

# ox-hugo handles links that are like [[foo][Text]] - i.e. a link to
# 'foo' which displays as 'Text'.  Org also has links like [[foo]]
# which behave identically to [[foo][foo]], i.e. they use the link
# itself as the text to show.  ox-hugo doesn't handle these, however -
# they simply disappear in the generated Markdown.
#
# This script just uses a regex to turn [[foo]] to [[foo][foo]].
import re
import sys

# Filter for Org text: reads stdin, writes the fixed text to stdout, then
# appends a "* Missing links" section listing internal links for which no
# <<target>> was found anywhere in the input.

# This is a bit dense, but: it matches any text wrapped in [[ and ]]
# provided that text contains no square brackets.  That text itself is
# then group 1.
link_re = re.compile(r"\[\[([^][]+)\]\]")

# A bare image URL, i.e. one that is not directly preceded by a square
# bracket (which would make it part of an existing [[...]] link).  The
# check is a lookbehind so the preceding character is not consumed (and
# therefore not lost on substitution), and so that a URL at the very
# start of a line is matched too.  Group 1 is the URL.
bare_link_re = re.compile(
    r"(?<![\[\]])(http\S*(jpg|jpeg|JPG|JPEG|gif|GIF|png|PNG))")

# A complete [[link][description]]; group 1 is the link, group 2 the
# description.
proper_link_re = re.compile(r"\[\[([^][]+)\]\[([^][]+)\]\]")

# A dedicated target, <<name>>; group 1 is the name.
target_re = re.compile(r"<<([^\<\>]+)>>")

# State accumulated over the whole input:
seen = set()     # every link destination encountered
targets = set()  # every <<target>> name encountered
seen_line = {}   # link destination -> 1-based line of its last occurrence


def fix_link(m):
    """Regex callback: turn a ``[[foo]]`` match into ``[[foo][foo]]``."""
    t = m.group(1)
    seen.add(t)
    return "[[" + t + "][" + t + "]]"


def fix_image_link(m):
    """Regex callback: wrap a bare image URL as ``[[url][image]]``."""
    t = m.group(1)
    seen.add(t)
    descr = "image"
    return "[[" + t + "][" + descr + "]]"


def process_line(line, lineno):
    """Record the links/targets on *line* and return the fixed-up line.

    *lineno* is the 1-based line number, stored for the missing-links
    report.
    """
    for dest, _descr in proper_link_re.findall(line):
        seen.add(dest)
        seen_line[dest] = lineno
    for dest in link_re.findall(line):
        seen.add(dest)
        seen_line[dest] = lineno
    targets.update(target_re.findall(line))
    # Order matters: description-less links are expanded first, so the
    # URLs inside them sit next to a bracket and are skipped by the
    # bare-URL pass.
    expanded = link_re.sub(fix_link, line)
    return bare_link_re.sub(fix_image_link, expanded)


def missing_links():
    """Return sorted ``(line, link)`` pairs for links with no target.

    Links starting with ``http`` are external and therefore skipped, as
    are destinations that never had a line number recorded (bare image
    URLs).  Sorting keeps the report stable from run to run.
    """
    return sorted(
        (seen_line[link], link)
        for link in seen - targets
        if not link.startswith("http") and link in seen_line
    )


def main(infile=None, outfile=None):
    """Filter *infile* to *outfile* (default stdin/stdout) and append the report."""
    infile = sys.stdin if infile is None else infile
    outfile = sys.stdout if outfile is None else outfile

    fixed = ""
    for lineno, line in enumerate(infile, start=1):
        fixed = process_line(line, lineno)
        outfile.write(fixed)
    # Keep the report heading on its own line even if the input did not
    # end with a newline.
    if fixed and not fixed.endswith("\n"):
        outfile.write("\n")

    outfile.write("* Missing links\n")
    for lineno, link in missing_links():
        outfile.write("- line {}: <<{}>>\n".format(lineno, link))


if __name__ == "__main__":
    main()