#!/usr/bin/env python3
"""Relocate old WordPress image links found in a text file.

Usage: <script> INPUT OUTPUT

INPUT is scanned line by line for image URLs wrapped in parentheses.  For
every URL whose file can be located under ``search_dir``, shell commands
that copy the file below ``dest_dir`` are printed to standard output
(together with ``#`` comment lines describing what was found), and the URL
is mapped to a path below ``dest_rel``.  OUTPUT receives the text of INPUT
with every mapped URL replaced by its new path.
"""

import os
import re
import shlex
import sys

search_dir = "/home/hodapp/source/blag/wp-content-old/"
search_hints = ["uploads"]
dest_dir = "/home/hodapp/source/blag/hugo_blag/static/wp_old"
dest_rel = ["/wp_old"]

# Matches text like http://.../wp-content/... wrapped in ( and ).
# Group 1 is the URL itself.
# Group 2 is the base filename (without path).
# Group 3 is the file extension.
link_re = re.compile(r"\((http[^)]+/([^).]+\.(jpg|jpeg|JPG|JPEG|gif|GIF|png|PNG)))\)")


def find_source(url, root=None, hints=None):
    """Locate the local file that corresponds to *url*.

    Walks *backwards* from the filename, adding one component of the URL
    path at a time, and for each such tail tries it directly under *root*
    and then with a growing prefix of *hints* inserted in between.

    Args:
        url: The URL to look up; it is split on "/".
        root: Directory to search in (defaults to ``search_dir``).
        hints: Sub-directory names to try (defaults to ``search_hints``).

    Returns:
        ``(path, rel)`` for the first existing file, where ``rel`` is the
        list of URL components that was matched, or ``None`` if no
        candidate exists.
    """
    root = search_dir if root is None else root
    hints = search_hints if hints is None else hints
    parts = url.split("/")
    for i in range(1, len(parts) + 1):
        rel = parts[-i:]
        for j in range(len(hints) + 1):
            candidate = os.path.join(root, *(hints[:j] + rel))
            if os.path.isfile(candidate):
                return candidate, rel
    return None


def copy_commands(src, rel, dest=None):
    """Return the shell commands that copy *src* to *rel* below *dest*.

    Paths are passed through ``shlex.quote`` so that names containing
    spaces or shell metacharacters stay one argument; paths made only of
    safe characters are emitted unchanged.

    Args:
        src: Path of the existing file.
        rel: List of path components of the destination, filename last.
        dest: Destination root (defaults to ``dest_dir``).

    Returns:
        A two-element list: the ``mkdir -p`` command for the destination
        directory and the ``cp`` command for the file.
    """
    dest = dest_dir if dest is None else dest
    dst_parent = os.path.join(dest, *rel[:-1])
    dst = os.path.join(dest, *rel)
    return [
        "mkdir -p {}".format(shlex.quote(dst_parent)),
        "cp {} {}".format(shlex.quote(src), shlex.quote(dst)),
    ]


def collect_replacements(lines):
    """Scan *lines* for image links and print the copy script.

    Args:
        lines: Iterable of text lines.

    Returns:
        Dict mapping each URL whose file was found to its new path below
        ``dest_rel``.
    """
    replacements = {}
    # NOTE: the reported line number is zero-based.
    for n, line in enumerate(lines):
        for url, base, _ in link_re.findall(line):
            print("# line {}: url={} base={}".format(n, url, base))
            hit = find_source(url)
            if hit is None:
                print("# *** not found")
                continue
            src, rel = hit
            print("# found file: {}".format(src))
            for command in copy_commands(src, rel):
                print(command)
            rel_dst = os.path.join(*(dest_rel + rel))
            print("# Map {} to {}".format(url, rel_dst))
            replacements[url] = rel_dst
    return replacements


def rewrite_lines(lines, replacements):
    """Return *lines* with every key of *replacements* substituted.

    Longer URLs are substituted first, so a URL that happens to be a
    substring of another mapped URL cannot clobber the longer one.

    Args:
        lines: Iterable of text lines.
        replacements: Dict mapping old text to new text.

    Returns:
        List of rewritten lines, in the same order as *lines*.
    """
    ordered = sorted(replacements, key=len, reverse=True)
    rewritten = []
    for line in lines:
        for old in ordered:
            line = line.replace(old, replacements[old])
        rewritten.append(line)
    return rewritten


def main(argv=None):
    """Run the script: read INPUT, print the copy script, write OUTPUT.

    Args:
        argv: Argument list without the program name (defaults to
            ``sys.argv[1:]``).  The first two entries are the input and
            output paths.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) < 2:
        sys.exit("usage: {} INPUT OUTPUT".format(sys.argv[0]))
    # Read everything up front so the input handle is closed promptly.
    with open(args[0], "r") as f:
        data = list(f)
    replacements = collect_replacements(data)
    with open(args[1], "w") as f:
        f.writelines(rewrite_lines(data, replacements))


if __name__ == "__main__":
    main()