ai-migrate.zip (40.1 KB)
(In every script below, replace josefkulovany.com or WEBSITENAME with your own domain.)
Being, as I often am, a canary in the coal mine for others, I was yet again censored in the most uncouth way a few months back. This time, when Hostinger decided to delete my hosting plan and keep my funds, pre-paid two years in advance, I was left with little choice but to migrate a personal website of mine that had been generated automatically with AI. Being auto-generated, and with no free migration tool readily available, I did what anyone would do: I wrote my own migration scripts.
And while these scripts didn’t work for me entirely, they worked pretty darn well, so it follows that they might be good fodder for you, dear user, should mooks like Hostinger ever give you the boot too. God bless.
My manually migrated portfolio: https://josefkulovany.com/
This suite of scripts assumes you have already created a local copy of your website using your browser’s File → Save As (or right-click → Save As). Your files are therefore already in a local folder. That folder appears to be good to go, but it will usually still reference globally hosted images on the CDN: even though you downloaded the images locally, the boilerplate markup keeps pointing at them remotely. Your “simple” website is in fact so much code referencing so many other sites that it is rendered useless should you ever choose to leave, and the moment you stop paying, the images stop showing up even though you have saved them locally. Never fear, zCHG.org is here…
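Before running anything, you can gauge how bad it is. Here is a quick sketch, not part of the suite, that counts the CDN references still lurking in your local copy (the folder name `site-export` is an assumption; use wherever your browser saved the pages):

```python
# Count how many tags in the browser-saved copy still point at the generator's CDN
# instead of local files. "site-export" is a placeholder folder name.
import re
from pathlib import Path

CDN_PATTERN = re.compile(r'https://assets\.zyrosite\.com/[^"\']+')

for page in Path("site-export").rglob("*.html"):
    hits = CDN_PATTERN.findall(page.read_text(encoding="utf-8", errors="ignore"))
    if hits:
        print(f"{page}: {len(hits)} CDN references still present")
```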
The following scripts can easily be adapted to other AI website generators, e.g. Wix. Just copy/paste a given script into a bot’s prompt and ask something like “Adapt the following script for my website,” after looking at your site’s source code for clues about which CDN that particular generator uses.
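As a concrete sketch of what that adaptation usually amounts to: the only generator-specific piece of remove-global1.py further down is its CDN pattern, so swapping hosts is a one-line change. The hostname below is a made-up placeholder; use whatever host you actually see in your page source:

```python
# The generator-specific part is just the CDN regex.
# "static.examplebuilder.com" is a placeholder hostname, not a real CDN.
import re

ZYRO_PATTERN = re.compile(r'https://assets\.zyrosite\.com/[^"\']+')          # original target
OTHER_PATTERN = re.compile(r'https://static\.examplebuilder\.com/[^"\']+')   # your generator's CDN

html = '<img src="https://static.examplebuilder.com/media/photo.jpg">'
print(OTHER_PATTERN.findall(html))  # ['https://static.examplebuilder.com/media/photo.jpg']
```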
You can also run more than one script against a given website, as a kind of filter. Say you’ve already parsed a site and now want to minify it as a second pass: you can do that. Useful, because although you may have achieved a stand-alone copy, you did so keeping too much boilerplate. Useful because you loved the design but hated the frill. Useful because it’s YOUR property now, and why would anyone want property that isn’t easy to edit?
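A minimal sketch of that kind of chaining, assuming both scripts sit beside your saved pages and that you have pointed the second script’s SRC_DIR at the first pass’s recompiled-pages output:

```python
# Two passes back to back: capture structure/images first, then inline and minify.
# Assumes remove-global3.py's SRC_DIR has been edited to point at "recompiled-pages".
import subprocess
import sys

subprocess.run([sys.executable, "structure-and-image.py"], check=True)  # pass 1
subprocess.run([sys.executable, "remove-global3.py"], check=True)       # pass 2
```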
hybrid_recompile.py (untested) - This was suggested to me after the project was done, and it probably works reasonably well. It combines the best attributes of the other five scripts, per the table below.
import os
import re
import shutil
import base64
import mimetypes
import requests
from pathlib import Path
from urllib.parse import urljoin, quote
from bs4 import BeautifulSoup
from htmlmin import minify
from csscompressor import compress
from jsmin import jsmin
BASE_URL = "https://josefkulovany.com/"
SCRIPT_DIR = Path(__file__).resolve().parent
SRC_DIR = SCRIPT_DIR
OUT_DIR = SCRIPT_DIR / "recompiled-pages"
ASSETS_DIR = OUT_DIR / "assets"
OUT_DIR.mkdir(parents=True, exist_ok=True)
ASSETS_DIR.mkdir(parents=True, exist_ok=True)
COPY_EXTENSIONS = {".woff", ".woff2", ".ttf", ".eot", ".otf", ".mp4", ".mp3", ".ico", ".svg", ".webp", ".bmp", ".gif", ".png", ".jpg", ".jpeg"}
def read_file(path):
return path.read_text(encoding="utf-8", errors="ignore")
def write_file(path, content):
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(content, encoding="utf-8")
def sanitize_filename(filename):
return quote(filename, safe='').replace('%20', '_').replace('%', '_')
def copy_asset(src_path):
src_path = src_path.resolve()
if not src_path.exists() or not src_path.is_file():
return None
dest_name = src_path.name
dest_path = ASSETS_DIR / dest_name
counter = 1
while dest_path.exists():
stem = src_path.stem
suffix = src_path.suffix
dest_name = f"{stem}_{counter}{suffix}"
dest_path = ASSETS_DIR / dest_name
counter += 1
shutil.copy2(src_path, dest_path)
return Path("assets") / dest_name
def encode_image_base64(local_path):
try:
mime, _ = mimetypes.guess_type(str(local_path))
if mime is None:
mime = "application/octet-stream"
b64_data = base64.b64encode(local_path.read_bytes()).decode('utf-8')
return f"data:{mime};base64,{b64_data}"
except Exception:
return None
def inline_css(css_path, base_path, processed=None):
if processed is None:
processed = set()
css_path = (base_path / css_path).resolve()
if not css_path.exists() or css_path in processed:
return ""
processed.add(css_path)
css_text = read_file(css_path)
# Recursive @import
def import_replacer(match):
import_url = match.group(1).strip("'\"")
if import_url.startswith("http") or import_url.startswith("data:"):
return match.group(0)
return inline_css(import_url, css_path.parent, processed)
css_text = re.sub(r'@import\s+url\(([^)]+)\);?', import_replacer, css_text)
css_text = re.sub(r'@import\s+["\']([^"\']+)["\'];?', import_replacer, css_text)
def url_replacer(match):
orig_url = match.group(1).strip("'\"")
if orig_url.startswith("data:") or orig_url.startswith("http"):
return f"url({orig_url})"
asset_path = (css_path.parent / orig_url).resolve()
copied = copy_asset(asset_path)
return f"url({copied.as_posix()})" if copied else f"url({orig_url})"
css_text = re.sub(r'url\(([^)]+)\)', url_replacer, css_text)
return compress(css_text)
def process_html(html_path):
soup = BeautifulSoup(read_file(html_path), "html.parser")
base_path = html_path.parent
if not soup.find("base"):
soup.head.insert(0, soup.new_tag("base", href="./"))
# Inline CSS
for link in soup.find_all("link", rel="stylesheet"):
href = link.get("href", "")
if href.startswith("http"):
link.decompose()
continue
try:
css_code = inline_css(href, base_path)
style_tag = soup.new_tag("style")
style_tag.string = css_code
link.replace_with(style_tag)
except:
link.decompose()
# Inline JS
for script in soup.find_all("script", src=True):
src = script["src"]
if src.startswith("http"):
script.decompose()
continue
js_path = (base_path / src).resolve()
try:
js_code = jsmin(read_file(js_path))
new_script = soup.new_tag("script")
new_script.string = js_code
script.replace_with(new_script)
except:
script.decompose()
# Inline images or fallback to BASE_URL
for img in soup.find_all("img", src=True):
src = img["src"]
img_path = (base_path / src).resolve()
if img_path.exists():
data_uri = encode_image_base64(img_path)
if data_uri:
img["src"] = data_uri
img.attrs.pop("srcset", None)
continue
img["src"] = urljoin(BASE_URL, src)
# Other assets (audio, video, source, iframe, etc.)
for tag in soup.find_all(src=True):
src = tag["src"]
if src.startswith("http") or src.startswith("data:"):
continue
asset_path = (base_path / src).resolve()
copied = copy_asset(asset_path)
tag["src"] = copied.as_posix() if copied else urljoin(BASE_URL, src)
# Fonts/icons/etc
for tag in soup.find_all(href=True):
href = tag["href"]
if href.startswith(("http", "data:", "#")):
continue
asset_path = (base_path / href).resolve()
ext = asset_path.suffix.lower()
copied = copy_asset(asset_path) if ext in COPY_EXTENSIONS else None
tag["href"] = copied.as_posix() if copied else urljoin(BASE_URL, href)
return minify(str(soup), remove_empty_space=True, remove_comments=True)
def recompile_all():
    for html_path in SRC_DIR.rglob("*.html"):
        # Skip files already written to the output folder (matters on re-runs)
        if OUT_DIR in html_path.parents:
            continue
        rel_path = html_path.relative_to(SRC_DIR)
        out_path = OUT_DIR / rel_path
try:
compiled = process_html(html_path)
write_file(out_path, compiled)
print(f"✅ Compiled: {rel_path}")
except Exception as e:
print(f"❌ Error processing {rel_path}: {e}")
if __name__ == "__main__":
recompile_all()
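To try it, drop the script next to your saved pages and run it with Python 3. It needs beautifulsoup4, htmlmin, csscompressor, jsmin, and requests installed; it walks every .html under its own folder and writes the compiled copies into recompiled-pages/.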
Feature / Script | remove-global1.py | remove-global2.py | remove-global3.py | structure-and-image.py | structure_capture.py | hybrid_recompile.py |
---|---|---|---|---|---|---|
Purpose | Replace Zyro image URLs | Simple asset inliner | Full local site compiler | Image inliner + URL rewiring | Live site resource rewiring | Combines the best of all five |
CSS Inlining | ❌ | ✅ (no `@import`) | ✅ (recursive `@import`) | ❌ (rewired to live URL) | Optional (remote fetch) | ✅ (recursive `@import`) |
JS Inlining | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
IMG Inlining (base64) | ❌ | ✅ | ✅ | ✅ | Optional (remote fetch) | ✅ |
Fallback to Live URL | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ |
Minify HTML | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
Asset Copying (fonts, etc.) | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ |
Handles CSS `@import` recursively | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ |
Filename Deduplication in `/assets` | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ |
Processes Remote Resources (live fetch) | ❌ | ❌ | ❌ | ❌ | ✅ (`requests`) | ❌ |
Flat vs Tree Output Structure | Tree | Tree | Tree | Flat | Flat | Tree |
Base `<base href="./">` Injection | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ |
Dependencies | Minimal (regex, pathlib) | `bs4`, `htmlmin`, `jsmin` | `bs4`, `htmlmin`, `jsmin` | `bs4` | `bs4`, `requests` | All above + `mimetypes`, `requests` |
Ideal Use Case | Pre-clean Zyro exports | Local-only quick builds | Full self-contained site | Portable preview with live fallback | Mirroring or syncing to live site | One-pass combination of the above (untested) |
The following is a script for repairing broken images. No idea why, but my own homepage was the only page I couldn’t resurrect in full: the images were broken base64. Base64 strings, you say? More on those later. In any case, broken base64 strings do no good, so this handy-dandy tool replaces every image in a given page (index.html) with images stored in the same folder as the target file and the script. It’s useful for that.
repair-2.py
from bs4 import BeautifulSoup
from pathlib import Path
# Input/Output HTML
html_path = Path("/home/josefkulovany/recompiled-pages/index.html")
output_path = html_path.with_name("index_manual_mapped.html")
# Your real files
real_images = [
"272097746_2889050794718330_2429556499248538454_n-Yg21D3gZlKtaKOEq.jpg",
"272170262_2889010071389069_839111276987981319_n-YbNDK81XJoflBe4N.jpg",
"293256215_3011295135827228_1062208197907635003_n-AoPEOLNpbxSvk2Dr.jpg",
"311983307_3096503063973101_4624707471446359481_n-mk3Leaz0bls5W4lq.jpg",
"312918800_3096502983973109_8016347582259544278_n-Yg21DZPzj9FEQra2.jpg",
"313425110_3097484513874956_5709197671429153513_n-YbNDKoq75WTWyeRZ.jpg",
"img_2282-Yg21DoElqKu6lW0g.JPG",
"unadjustednonraw_thumb_e21-YKboBnV1RpsjBGW2.jpg",
"untitled-meP38MLj8QhL60qL.png"
]
# Load HTML
soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "html.parser")
# Replace base64 images
for i, img in enumerate(soup.find_all("img")):
src = img.get("src", "")
if src.startswith("data:image"):
print(f"\n🔍 Found base64 image #{i+1}")
print("Nearby context:")
print("-" * 40)
parent = img.find_parent()
if parent:
snippet = parent.get_text().strip()
print(snippet[:300])
print("-" * 40)
# Show options
print("Choose replacement image (or 's' to skip, 'q' to quit):")
for idx, name in enumerate(real_images):
print(f"{idx + 1}. {name}")
while True:
choice = input("Your choice [number | s | q]: ").strip().lower()
if choice == 's':
print("⏭️ Skipped.")
break
elif choice == 'q':
print("👋 Exiting early. Saving progress...")
output_path.write_text(str(soup), encoding="utf-8")
exit(0)
elif choice.isdigit():
num = int(choice)
if 1 <= num <= len(real_images):
replacement = real_images[num - 1]
img['src'] = f"./{replacement}"
print(f"✅ Replaced with: {replacement}")
break
print("❌ Invalid input. Try again.")
# Save output
output_path.write_text(str(soup), encoding="utf-8")
print(f"\n🎉 Finished. Updated file saved to: {output_path}")
The following does a great job of capturing the structure of a given website, but lost track of images in my case:
structure_capture.py
import os
from bs4 import BeautifulSoup
from pathlib import Path
import requests
from urllib.parse import urljoin, urlparse, quote
import base64
# Use the actual location of the script
SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_DIR = SCRIPT_DIR
OUTPUT_DIR = SCRIPT_DIR / "recompiled-pages"
BASE_URL = "https://josefkulovany.com/"
EMBED_RESOURCES = False # Set to True to inline CSS/images as base64
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
def sanitize_filename(filename):
return quote(filename, safe='').replace('%20', '_').replace('%', '_')
def embed_resource(url):
try:
res = requests.get(url)
res.raise_for_status()
content_type = res.headers.get("Content-Type", "application/octet-stream")
data = base64.b64encode(res.content).decode('utf-8')
return f"data:{content_type};base64,{data}"
except Exception as e:
print(f"Error embedding {url}: {e}")
return url
def process_html(file_path):
with open(file_path, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "html.parser")
# Adjust base tag
base_tag = soup.find("base")
if base_tag:
base_tag["href"] = "./"
else:
soup.head.insert(0, soup.new_tag("base", href="./"))
# Fix resource links
for tag, attr in [("link", "href"), ("script", "src"), ("img", "src")]:
for el in soup.find_all(tag):
if el.has_attr(attr):
original_url = el[attr]
full_url = urljoin(BASE_URL, original_url)
if EMBED_RESOURCES and tag in ["link", "img"]:
if tag == "link" and "stylesheet" in el.get("rel", []):
try:
css = requests.get(full_url).text
style_tag = soup.new_tag("style")
style_tag.string = css
el.replace_with(style_tag)
except:
continue
elif tag == "img":
el[attr] = embed_resource(full_url)
else:
el[attr] = full_url
return str(soup)
def recompile_all():
for file in INPUT_DIR.glob("*.html"):
sanitized = sanitize_filename(file.name)
output_file = OUTPUT_DIR / sanitized
try:
compiled_html = process_html(file)
with open(output_file, "w", encoding="utf-8") as out:
out.write(compiled_html)
print(f"✔️ Recompiled: {file.name} -> {output_file.name}")
except Exception as e:
print(f"❌ Error processing {file.name}: {e}")
if __name__ == "__main__":
recompile_all()
The following is the best generic structure + image scrape I have. Everything beyond this will need to be custom-tailored to your website (scroll down for an example).
structure-and-image.py
import os
from pathlib import Path
from urllib.parse import urljoin, quote
import base64
import mimetypes
from bs4 import BeautifulSoup
# Paths
SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_DIR = SCRIPT_DIR
OUTPUT_DIR = SCRIPT_DIR / "recompiled-pages"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
BASE_URL = "https://josefkulovany.com/"
def sanitize_filename(filename):
return quote(filename, safe='').replace('%20', '_').replace('%', '_')
def encode_image_to_base64(local_path):
try:
mime, _ = mimetypes.guess_type(str(local_path))
if mime is None:
mime = "application/octet-stream"
with open(local_path, "rb") as f:
b64_data = base64.b64encode(f.read()).decode('utf-8')
return f"data:{mime};base64,{b64_data}"
except Exception:
return None
def process_html(file_path):
with open(file_path, "r", encoding="utf-8") as f:
soup = BeautifulSoup(f, "html.parser")
# Ensure <base href="./"> for relative links
if not soup.find("base"):
soup.head.insert(0, soup.new_tag("base", href="./"))
base_path = file_path.parent
# For link and script tags: rewrite URLs to live absolute URLs (preserve site structure and CSS/JS loading)
for tag, attr in [("link", "href"), ("script", "src")]:
for el in soup.find_all(tag):
if el.has_attr(attr):
original_url = el[attr]
# Make absolute URL on live site
live_url = urljoin(BASE_URL, original_url)
el[attr] = live_url
# For img tags: try local file, inline base64 if possible, else fallback to live absolute URL
for img in soup.find_all("img", src=True):
orig_src = img["src"]
local_img_path = (base_path / orig_src).resolve()
if local_img_path.exists() and local_img_path.is_file():
data_uri = encode_image_to_base64(local_img_path)
if data_uri:
img["src"] = data_uri
# Remove srcset if present because it conflicts with data URI
if "srcset" in img.attrs:
del img.attrs["srcset"]
else:
# Could not encode, fallback to live URL
img["src"] = urljoin(BASE_URL, orig_src)
else:
# No local file, use live URL
img["src"] = urljoin(BASE_URL, orig_src)
return str(soup)
def recompile_all():
for file in INPUT_DIR.glob("*.html"):
sanitized = sanitize_filename(file.name)
output_file = OUTPUT_DIR / sanitized
try:
compiled_html = process_html(file)
with open(output_file, "w", encoding="utf-8") as out:
out.write(compiled_html)
print(f"✔️ Recompiled: {file.name} -> {output_file.name}")
except Exception as e:
print(f"❌ Error processing {file.name}: {e}")
if __name__ == "__main__":
recompile_all()
These are older versions of the same thing. They are more generic fodder.
remove-global1.py
— Targeted Zyro CDN Image Replacer

Usefulness:
- Specifically designed to replace remote Zyro-hosted image URLs (`https://assets.zyrosite.com/...`) with local equivalents.
- It scans HTML/PHP files, searches for Zyro image URLs, and replaces them if a matching file is found locally.

Key Features:
- Minimal and fast.
- Uses `re` (regex) for pattern matching and `Path` for filesystem navigation.
- Only focuses on remote image replacement, not inlining or asset compression.

Best For:
- Cleaning up remote references from a Zyro-exported site.
- Light preprocessing before more aggressive compilation/inlining.
remove-global2.py
— Basic Inliner and Recompiler

Usefulness:
- Inlines local CSS, JS, and IMG files.
- Copies non-inlined assets (fonts, media, icons) to a centralized `assets/` folder.
- Removes external references (e.g., to CDN scripts or styles).
- Minifies the resulting HTML.

Key Features:
- Clean, readable structure.
- Handles basic inlining and asset copying.
- No recursion on `@import` in CSS.
- Less robust filename conflict resolution (assumes unique asset names).

Best For:
- Quick inlining and cleanup of a local site.
- Small projects or simple export workflows.
remove-global3.py
— Advanced Recursive Inliner with Conflict Handling

Usefulness:
- Recursively inlines `@import` statements in CSS files.
- Automatically resolves filename conflicts in `assets/` by renaming.
- Extends asset support to images and more media types (e.g., `.jpg`, `.png`, `.jpeg`).
- More fault-tolerant and comprehensive than v2.

Key Features:
- Prevents infinite recursion in CSS `@import`s using a set of processed files.
- Resolves duplicate asset names by appending numeric suffixes (e.g., `font_1.woff`).
- More complete `COPY_EXTENSIONS` set than v2.
- More robust and production-safe.

Best For:
- Full site compilation with complex CSS imports.
- Real-world exports where duplicate asset names are common.
- Handling large or messy input folders.
remove-global1.py
import re
from pathlib import Path
# Root folder of your website files
ROOT = Path("/home/WEBSITENAME")
# Folder(s) where local images reside (adjust if images scattered elsewhere)
LOCAL_IMAGE_FOLDERS = [
ROOT / "images",
ROOT / "assets" / "images",
# add more if needed
]
# Regex to find remote Zyro CDN URLs in img src or other tags
REMOTE_IMG_URL_PATTERN = re.compile(
r'https://assets\.zyrosite\.com/[^"\']+'
)
def find_local_image(filename):
"""Search all LOCAL_IMAGE_FOLDERS for a file with given filename"""
for folder in LOCAL_IMAGE_FOLDERS:
candidate = folder / filename
if candidate.exists():
return candidate.relative_to(ROOT)
# fallback: try anywhere under ROOT
for candidate in ROOT.rglob(filename):
if candidate.is_file():
return candidate.relative_to(ROOT)
return None
def fix_file(file_path):
content = file_path.read_text(encoding="utf-8", errors="ignore")
changed = False
urls = set(REMOTE_IMG_URL_PATTERN.findall(content))
for url in urls:
filename = url.split("/")[-1].split("?")[0].split("#")[0]
local_rel_path = find_local_image(filename)
if local_rel_path:
# Use forward slashes in HTML paths
local_rel_str = "/" + str(local_rel_path).replace("\\", "/")
content = content.replace(url, local_rel_str)
changed = True
print(f"Replaced {url} → {local_rel_str} in {file_path}")
else:
print(f"⚠️ WARNING: local image file not found for {filename} referenced in {file_path}")
if changed:
file_path.write_text(content, encoding="utf-8")
def main():
print(f"Scanning {ROOT} for .php and .html files to fix image URLs...\n")
for file_path in ROOT.rglob("*"):
if file_path.suffix.lower() in {".php", ".html"}:
fix_file(file_path)
print("\nAll done!")
if __name__ == "__main__":
main()
remove-global2.py
import os
import shutil
import base64
from pathlib import Path
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from htmlmin import minify
from csscompressor import compress
from jsmin import jsmin
SRC_DIR = Path("/home/WEBSITENAME").resolve()
OUT_DIR = SRC_DIR / "recompiled-pages"
ASSETS_DIR = OUT_DIR / "assets"
OUT_DIR.mkdir(exist_ok=True, parents=True)
ASSETS_DIR.mkdir(exist_ok=True, parents=True)
# Asset file extensions to copy (non-inlined)
COPY_EXTENSIONS = {".woff", ".woff2", ".ttf", ".eot", ".otf", ".mp4", ".mp3", ".ico", ".svg", ".webp", ".bmp", ".gif"}
def read_file(path):
return path.read_text(encoding="utf-8", errors="ignore")
def write_file(path, content):
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(content, encoding="utf-8")
def copy_asset(src_path):
"""
Copy a local asset into the assets folder if not already copied.
Returns the relative path inside OUT_DIR to use in rewritten HTML.
"""
src_path = src_path.resolve()
if not src_path.exists() or not src_path.is_file():
return None
dest_path = ASSETS_DIR / src_path.name
if not dest_path.exists():
shutil.copy2(src_path, dest_path)
return Path("assets") / src_path.name
def inline_css(css_path, base_path):
css_path = (base_path / css_path).resolve()
if not css_path.exists():
return ""
css_text = read_file(css_path)
# Find url(...) references inside CSS to copy assets and rewrite URLs
def replace_url(match):
url = match.group(1).strip('\'"')
if url.startswith("data:") or url.startswith("http"):
return f"url({url})"
asset_path = (css_path.parent / url).resolve()
copied = copy_asset(asset_path)
if copied:
return f"url({copied.as_posix()})"
return f"url({url})"
import re
css_text = re.sub(r"url\(([^)]+)\)", replace_url, css_text)
return compress(css_text)
def inline_assets(html_path):
html_path = html_path.resolve()
html = read_file(html_path)
soup = BeautifulSoup(html, "html.parser")
base_path = html_path.parent
# Inline <link rel=stylesheet>
for link in soup.find_all("link", rel="stylesheet"):
href = link.get("href", "")
if href.startswith("http"):
# External URL - remove or keep? Here, remove
link.decompose()
continue
try:
css_code = inline_css(href, base_path)
style_tag = soup.new_tag("style")
style_tag.string = css_code
link.replace_with(style_tag)
except Exception:
link.decompose()
# Inline <script src=...>
for script in soup.find_all("script", src=True):
src = script["src"]
if src.startswith("http"):
script.decompose()
continue
js_path = (base_path / src).resolve()
if js_path.exists():
try:
js_code = jsmin(read_file(js_path))
new_script = soup.new_tag("script")
new_script.string = js_code
script.replace_with(new_script)
except Exception:
script.decompose()
else:
script.decompose()
# Inline <img src=...>
for img in soup.find_all("img", src=True):
src = img["src"]
if src.startswith("http"):
# External - remove or keep? Remove here
img.decompose()
continue
img_path = (base_path / src).resolve()
if img_path.exists() and img_path.is_file():
try:
mime = f"image/{img_path.suffix.lstrip('.')}"
b64 = base64.b64encode(img_path.read_bytes()).decode()
img["src"] = f"data:{mime};base64,{b64}"
# Also remove srcset attribute if present
if "srcset" in img.attrs:
del img.attrs["srcset"]
except Exception:
img.decompose()
else:
img.decompose()
# Rewrite other asset references in tags (audio, video, source, etc.)
for tag in soup.find_all(src=True):
src = tag["src"]
if src.startswith("http") or src.startswith("data:"):
continue
asset_path = (base_path / src).resolve()
copied = copy_asset(asset_path)
if copied:
tag["src"] = copied.as_posix()
else:
# Remove tags referencing missing assets
tag.decompose()
# Rewrite href assets (fonts, icons, etc.)
for tag in soup.find_all(href=True):
href = tag["href"]
if href.startswith("http") or href.startswith("data:") or href.startswith("#"):
continue
asset_path = (base_path / href).resolve()
ext = asset_path.suffix.lower()
if ext in COPY_EXTENSIONS:
copied = copy_asset(asset_path)
if copied:
tag["href"] = copied.as_posix()
else:
tag.decompose()
# Minify final HTML output
minified_html = minify(str(soup), remove_empty_space=True, remove_comments=True)
return minified_html
def process_all_pages():
for root, _, files in os.walk(SRC_DIR):
root_path = Path(root)
# Skip output folders to avoid recursion
if root_path == OUT_DIR or root_path.is_relative_to(OUT_DIR):
continue
for file in files:
if file.lower().endswith((".html", ".php")):
html_path = root_path / file
rel_path = html_path.relative_to(SRC_DIR)
out_path = OUT_DIR / rel_path
try:
compiled_html = inline_assets(html_path)
write_file(out_path.with_suffix(".html"), compiled_html)
print(f"✅ Compiled: {out_path.with_suffix('.html')}")
except Exception as e:
print(f"❌ Error processing {html_path}: {e}")
if __name__ == "__main__":
process_all_pages()
remove-global3.py
import os
import re
import shutil
import base64
from pathlib import Path
from bs4 import BeautifulSoup
from htmlmin import minify
from csscompressor import compress
from jsmin import jsmin
SRC_DIR = Path("/home/WEBSITENAME").resolve()
OUT_DIR = SRC_DIR / "recompiled-pages"
ASSETS_DIR = OUT_DIR / "assets"
OUT_DIR.mkdir(exist_ok=True, parents=True)
ASSETS_DIR.mkdir(exist_ok=True, parents=True)
COPY_EXTENSIONS = {".woff", ".woff2", ".ttf", ".eot", ".otf", ".mp4", ".mp3", ".ico", ".svg", ".webp", ".bmp", ".gif", ".png", ".jpg", ".jpeg"}
def read_file(path):
return path.read_text(encoding="utf-8", errors="ignore")
def write_file(path, content):
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(content, encoding="utf-8")
def copy_asset(src_path):
src_path = src_path.resolve()
if not src_path.exists() or not src_path.is_file():
return None
# Create a unique filename in assets folder to avoid collisions
dest_name = src_path.name
dest_path = ASSETS_DIR / dest_name
counter = 1
while dest_path.exists():
# If collision, add suffix number
stem = src_path.stem
suffix = src_path.suffix
dest_name = f"{stem}_{counter}{suffix}"
dest_path = ASSETS_DIR / dest_name
counter += 1
shutil.copy2(src_path, dest_path)
return Path("assets") / dest_name
def inline_css(css_path, base_path, processed_files=None):
"""
Read CSS file, recursively inline @import CSS,
fix URLs to copied assets, and compress.
"""
if processed_files is None:
processed_files = set()
css_path = (base_path / css_path).resolve()
if css_path in processed_files:
return "" # Avoid circular imports
processed_files.add(css_path)
if not css_path.exists():
return ""
css_text = read_file(css_path)
# Handle @import statements recursively
def import_replacer(match):
import_url = match.group(1).strip('\'"')
# Only process local imports
if import_url.startswith("http") or import_url.startswith("data:"):
return match.group(0)
imported_css = inline_css(import_url, css_path.parent, processed_files)
return imported_css
css_text = re.sub(r'@import\s+url\(([^)]+)\);?', import_replacer, css_text)
css_text = re.sub(r'@import\s+["\']([^"\']+)["\'];?', import_replacer, css_text)
# Fix url(...) paths: copy assets and rewrite URLs to new assets folder
def url_replacer(match):
orig_url = match.group(1).strip('\'"')
if orig_url.startswith("data:") or orig_url.startswith("http"):
return f"url({orig_url})"
asset_path = (css_path.parent / orig_url).resolve()
copied = copy_asset(asset_path)
if copied:
return f"url({copied.as_posix()})"
else:
return f"url({orig_url})"
css_text = re.sub(r'url\(([^)]+)\)', url_replacer, css_text)
# Compress CSS
return compress(css_text)
def inline_assets(html_path):
html_path = html_path.resolve()
html = read_file(html_path)
soup = BeautifulSoup(html, "html.parser")
base_path = html_path.parent
# Inline <link rel=stylesheet> (CSS)
for link in soup.find_all("link", rel="stylesheet"):
href = link.get("href", "")
if href.startswith("http"):
# External CSS: skip or remove?
link.decompose()
continue
try:
css_code = inline_css(href, base_path)
style_tag = soup.new_tag("style")
style_tag.string = css_code
link.replace_with(style_tag)
except Exception:
link.decompose()
# Inline <script src=...>
for script in soup.find_all("script", src=True):
src = script["src"]
if src.startswith("http"):
# Remove external scripts (or keep if needed)
script.decompose()
continue
js_path = (base_path / src).resolve()
if js_path.exists():
try:
js_code = jsmin(read_file(js_path))
new_script = soup.new_tag("script")
new_script.string = js_code
script.replace_with(new_script)
except Exception:
script.decompose()
else:
script.decompose()
# Inline images <img>
for img in soup.find_all("img", src=True):
src = img["src"]
if src.startswith("http"):
img.decompose()
continue
img_path = (base_path / src).resolve()
if img_path.exists() and img_path.is_file():
try:
mime = f"image/{img_path.suffix.lstrip('.')}"
b64 = base64.b64encode(img_path.read_bytes()).decode()
img["src"] = f"data:{mime};base64,{b64}"
if "srcset" in img.attrs:
del img.attrs["srcset"]
except Exception:
img.decompose()
else:
img.decompose()
# Rewrite other src attributes (audio, video, source, iframe, etc)
for tag in soup.find_all(src=True):
src = tag["src"]
if src.startswith("http") or src.startswith("data:"):
continue
asset_path = (base_path / src).resolve()
copied = copy_asset(asset_path)
if copied:
tag["src"] = copied.as_posix()
else:
tag.decompose()
# Rewrite href attributes for fonts, icons, etc.
for tag in soup.find_all(href=True):
href = tag["href"]
if href.startswith("http") or href.startswith("data:") or href.startswith("#"):
continue
asset_path = (base_path / href).resolve()
ext = asset_path.suffix.lower()
if ext in COPY_EXTENSIONS:
copied = copy_asset(asset_path)
if copied:
tag["href"] = copied.as_posix()
else:
tag.decompose()
# Minify final HTML output
minified_html = minify(str(soup), remove_empty_space=True, remove_comments=True)
return minified_html
def process_all_pages():
for root, _, files in os.walk(SRC_DIR):
root_path = Path(root)
# Skip output folder to prevent recursion
if root_path == OUT_DIR or OUT_DIR in root_path.parents:
continue
for file in files:
if file.lower().endswith((".html", ".php")):
html_path = root_path / file
rel_path = html_path.relative_to(SRC_DIR)
out_path = OUT_DIR / rel_path
try:
compiled_html = inline_assets(html_path)
write_file(out_path.with_suffix(".html"), compiled_html)
print(f"✅ Compiled: {out_path.with_suffix('.html')}")
except Exception as e:
print(f"❌ Error processing {html_path}: {e}")
if __name__ == "__main__":
process_all_pages()
SYMLINK TOOLS
The following are precursors to the main scripts above. Scroll down past them for our UNO REVERSE card, which undoes these scripts’ effects, more or less.
- map_images_to_pages.py: scanning pages and mapping image references (detects missing vs found images).
- help-recursive.py: scanning pages for asset references, creating symlinks to local files if missing, trying remote downloads, or creating placeholders.
- map_assets_symlinks.py: scanning pages and media references to create symlinks for all found media in a centralized folder.
1. Image Reference Mapping Script

What it does:
- Recursively scans your entire project folder for:
  - Web page files (`.html`, `.php`)
  - Image files (`.jpg`, `.png`, `.svg`, etc.)
- Extracts all image references (`src="..."` or `url(...)`) inside the web pages.
- Checks if each referenced image actually exists on disk.
- Builds a report listing:
  - Found images that exist and which pages reference them.
  - Missing images that are referenced but not found.
- Outputs a summary and writes the detailed report to `image_mapping_report.txt`.
What it’s useful for:
- Identifying broken or missing image links in your web project.
- Understanding which pages reference which images.
- Helping clean up or fix missing asset issues.
- Preparing for asset reorganization or migration.
2. Asset Symlinking, Downloading, and Placeholder Script

What it does:
- Scans `.html` and `.php` files for common asset references (images, scripts, stylesheets, iframes, videos, audio, objects).
- For each asset reference, it:
  - Checks if the asset exists locally.
  - If missing, tries to find a local file with the same filename anywhere in the project.
  - If still missing, tries downloading from known remote prefixes (like YouTube, Google Fonts).
  - If download fails, creates a placeholder file.
- Creates symbolic links from the expected asset location to the found/downloaded/placeholder asset.
- Tracks statistics on how many symlinks, downloads, placeholders, and existing assets it handled.
What it’s useful for:
- Automatically repairing broken or missing asset links by linking or downloading assets.
- Creating placeholders so the site still works even if assets are missing.
- Simplifying asset management by centralizing missing assets into fallback directories.
- Useful during migration, deployment, or cleaning up incomplete projects.
3. Media Symlinking Script

What it does:
- Walks through the project directory recursively.
- For each `.html` or `.php` file, parses it (using BeautifulSoup) to find media references (`img`, `link`, `script` tags) that point to image or media files.
- Also extracts `url(...)` references in CSS styles.
- For each referenced media file found locally, creates a symlink in a dedicated folder (here named `public_assets`).
- Avoids duplicating symlinks if already created.
What it’s useful for:
- Gathering all referenced media files into one central folder via symlinks.
- Simplifying asset paths and access by having one “public assets” directory.
- Useful for deployments, CDN preparation, or bundling assets.
- Keeping original files in place but exposing them via centralized symlinks.
Summary
Script Purpose | What it Does | Use Case |
---|---|---|
Image Reference Mapping | Maps image references and checks for missing images | Finds broken image links and missing assets |
Asset Symlinking & Downloading | Fixes missing assets by symlinking or downloading, creates placeholders | Repairs missing files, automates asset fixes |
Media Symlinking | Collects all referenced media files and symlinks them into a central folder | Simplifies asset management and deployment |
map_images_to_pages.py
import os
import re
from pathlib import Path
from collections import defaultdict
# Set your root directory
root = Path("/home/WEBSITENAME/")
# Supported web file extensions
page_exts = {".html", ".php"}
img_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".bmp", ".tiff"}
# Image src pattern: <img src="..."> or url('...')
img_pattern = re.compile(r'src=["\']([^"\']+)["\']|url\(([^)]+)\)', re.IGNORECASE)
# Map from image path => list of pages referencing it
image_references = defaultdict(set)
# Track all valid images on disk (absolute and relative to root)
valid_images = set()
# Build list of valid image files
for img in root.rglob("*"):
if img.suffix.lower() in img_exts and img.is_file():
rel = img.relative_to(root)
valid_images.add(str(rel))
valid_images.add("/" + str(rel)) # Handle both formats
# Crawl all .html and .php files
for file in root.rglob("*"):
if file.suffix.lower() in page_exts:
content = file.read_text(errors="ignore")
rel_page = str(file.relative_to(root))
for match in img_pattern.findall(content):
# Combine both capture groups
img_src = match[0] or match[1]
img_src = img_src.strip("\"' ")
# Clean leading "./", "../", or quotes
img_src = os.path.normpath(img_src.lstrip("./"))
# Record reference
image_references[img_src].add(rel_page)
# Build output: image references → valid / missing
missing = []
mapped = []
print("\n=== IMAGE REFERENCE MAPPING ===\n")
for img, pages in sorted(image_references.items()):
found = img in valid_images
pages_str = ", ".join(sorted(pages))
if found:
mapped.append((img, pages))
print(f"[FOUND] {img}\n ↳ {pages_str}\n")
else:
missing.append((img, pages))
print(f"[MISSING] {img}\n ↳ {pages_str}\n")
print("=== SUMMARY ===")
print(f"✓ Total valid images found: {len(mapped)}")
print(f"✗ Total missing/orphaned: {len(missing)}")
print(f"🔎 Total image references: {len(image_references)}")
# Optionally: write results to a report file
with open("image_mapping_report.txt", "w") as f:
for img, pages in sorted(mapped):
f.write(f"[FOUND] {img} -> {', '.join(sorted(pages))}\n")
for img, pages in sorted(missing):
f.write(f"[MISSING] {img} -> {', '.join(sorted(pages))}\n")
help-recursive.py
import os
import re
import shutil
import requests
from pathlib import Path
from urllib.parse import urljoin, urlparse
# CONFIG
project_root = Path("/home/WEBSITENAME") # your actual root
known_remote_prefixes = [
"https://www.youtube.com", "https://fonts.googleapis.com", "https://fonts.gstatic.com"
]
fallback_dir = project_root / "_missing_assets"
fallback_dir.mkdir(exist_ok=True)
# TAGS and ATTRIBUTES to scan
asset_patterns = {
"img": "src",
"script": "src",
"link": "href",
"iframe": "src",
"source": "src",
"video": "src",
"audio": "src",
"object": "data",
}
# EXTENSIONS to look for
web_exts = [".html", ".php"]
# STORAGE
missing_assets = set()
symlinked = 0
downloaded = 0
placeheld = 0
existing = 0
def is_valid_url(link):
return any(link.startswith(p) for p in known_remote_prefixes)
def extract_assets(html_text):
assets = []
for tag, attr in asset_patterns.items():
matches = re.findall(fr'<{tag}[^>]*{attr}=["\'](.*?)["\']', html_text, re.IGNORECASE)
assets.extend(matches)
return assets
def try_find_local_match(filename):
for path in project_root.rglob("*"):
if path.name == filename and path.is_file():
return path
return None
def safe_download(url, dest):
try:
r = requests.get(url, timeout=5)
r.raise_for_status()
dest.write_bytes(r.content)
return True
except Exception as e:
print(f"⚠ Failed to download {url} → {e}")
return False
def create_placeholder(path):
path.write_text("// placeholder file\n")
print("🔎 Scanning HTML and PHP files...\n")
for file in project_root.rglob("*"):
if file.suffix.lower() not in web_exts or not file.is_file():
continue
html = file.read_text(errors="ignore")
asset_paths = extract_assets(html)
for ref in asset_paths:
if not ref or ref.startswith("data:") or ref.startswith("#"):
continue
# Parse and normalize
asset_path = urlparse(ref).path
asset_path = asset_path.lstrip("/") # always relative to web root
asset_file = project_root / asset_path
if asset_file.exists():
existing += 1
continue
missing_assets.add(asset_path)
asset_name = Path(asset_path).name
asset_target_dir = project_root / os.path.dirname(asset_path)
asset_target_dir.mkdir(parents=True, exist_ok=True)
dest_path = asset_target_dir / asset_name
# Try local match
local = try_find_local_match(asset_name)
if local:
try:
dest_path.symlink_to(local.resolve())
print(f"🔗 Symlinked {dest_path} → {local}")
symlinked += 1
continue
except Exception as e:
print(f"❌ Symlink error: {e}")
# Try remote download
for remote_base in known_remote_prefixes:
full_url = urljoin(remote_base + "/", asset_path)
download_dest = fallback_dir / asset_name
if safe_download(full_url, download_dest):
try:
dest_path.symlink_to(download_dest.resolve())
print(f"🌐 Downloaded + Linked {dest_path} ← {full_url}")
downloaded += 1
break
except Exception as e:
print(f"❌ Link fail after download: {e}")
continue
else:
# Final fallback: placeholder
create_placeholder(dest_path)
print(f"📄 Placeholder created for {dest_path}")
placeheld += 1
print("\n✅ Done!")
print(f"✔ Existing assets found: {existing}")
print(f"🔗 Symlinks created: {symlinked}")
print(f"🌍 Files downloaded: {downloaded}")
print(f"📄 Placeholders created: {placeheld}")
print(f"❓ Total missing (unique): {len(missing_assets)}")
map_assets_symlinks.py
import os
import re
from pathlib import Path
from bs4 import BeautifulSoup
# Supported image/media extensions
MEDIA_EXTS = {'.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.ico'}
# Directory where symlinks to all found media will be stored
SYMLINK_DEST = Path("public_assets")
SYMLINK_DEST.mkdir(exist_ok=True)
# Root directory to scan
ROOT_DIR = Path(".").resolve()
# Patterns to extract src/href/urls
SRC_HREF_REGEX = re.compile(r'''(?:src|href)\s*=\s*["']([^"']+)["']''')
URL_REGEX = re.compile(r'url\(["\']?([^"\')]+)["\']?\)')
def find_media_references(filepath):
"""Extract all image/media paths from a file."""
refs = set()
with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
content = f.read()
# Use BeautifulSoup for structured HTML
if filepath.suffix in {".html", ".php"}:
soup = BeautifulSoup(content, "html.parser")
for tag in soup.find_all(["img", "link", "script"]):
for attr in ["src", "href"]:
val = tag.get(attr)
if val and any(val.lower().endswith(ext) for ext in MEDIA_EXTS):
refs.add(val)
# Also check CSS-style `url(...)` refs
refs.update(URL_REGEX.findall(content))
return refs
def normalize_ref(ref, base_path):
"""Resolve relative path refs to absolute ones."""
ref = ref.split("?")[0].split("#")[0] # Strip query/fragments
ref_path = (base_path / ref).resolve()
return ref_path if ref_path.exists() else None
def symlink_media(ref_path):
"""Create a symlink in the public assets folder if it doesn't exist."""
try:
if ref_path.suffix.lower() not in MEDIA_EXTS:
return
target = SYMLINK_DEST / ref_path.name
if not target.exists():
os.symlink(ref_path, target)
print(f"✅ Linked: {target} → {ref_path}")
except Exception as e:
print(f"⚠️ Failed to link {ref_path}: {e}")
def main():
for root, dirs, files in os.walk(ROOT_DIR):
for file in files:
if file.endswith((".html", ".php")):
filepath = Path(root) / file
refs = find_media_references(filepath)
for ref in refs:
ref_path = normalize_ref(ref, filepath.parent)
if ref_path:
symlink_media(ref_path)
if __name__ == "__main__":
main()
UNO REVERSE CARD (always better to make a backup as a tar or zip instead of this…)
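If you take the tar/zip advice, here is a minimal standard-library sketch, using the same /home/WEBSITENAME placeholder as the other scripts:

```python
# Snapshot the site root as site-backup.tar.gz before letting repair-migrate.py loose.
# /home/WEBSITENAME is the same placeholder root used throughout; change it to your own.
import shutil

archive = shutil.make_archive("site-backup", "gztar", root_dir="/home/WEBSITENAME")
print(f"Backup written to {archive}")
```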
repair-migrate.py
import os
import re
from pathlib import Path
# Root directory to scan & fix
ROOT_DIR = Path("/home/WEBSITENAME") # change this to your root
# Extensions of pages to scan
PAGE_EXTS = {".html", ".php"}
# Image file extensions to recognize
IMG_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".bmp", ".tiff"}
# Regex pattern to find image refs in HTML and CSS styles (src="...", url(...))
IMG_REF_PATTERN = re.compile(
r"""(?:
src\s*=\s*["']([^"']+)["'] | # src="..."
url\(\s*['"]?([^"')]+)['"]?\s*\) # url('...')
)""",
re.IGNORECASE | re.VERBOSE,
)
def find_all_images(root):
"""Build a dict: filename -> list of absolute image paths found on disk"""
images = {}
for img_path in root.rglob("*"):
if img_path.is_file() and img_path.suffix.lower() in IMG_EXTS:
name = img_path.name
images.setdefault(name, []).append(img_path.resolve())
return images
def make_relative_path(from_path, to_path):
"""Make a relative path from from_path parent to to_path"""
return os.path.relpath(to_path, start=from_path.parent)
def fix_image_paths_in_file(file_path, images_on_disk):
"""Scan and fix image references in one HTML/PHP file."""
content = file_path.read_text(encoding="utf-8", errors="ignore")
changed = False
def replacement(match):
# Extract image URL from src or url()
img_ref = match.group(1) or match.group(2)
img_ref_clean = img_ref.split("?")[0].split("#")[0] # Remove query/fragment
img_name = os.path.basename(img_ref_clean)
# Check if referenced path exists relative to page
ref_path = (file_path.parent / img_ref_clean).resolve()
if ref_path.exists():
# Image found at current reference, no change needed
return match.group(0)
# Image file missing at current path, try to find it by filename
candidates = images_on_disk.get(img_name)
if not candidates:
# No image found on disk by that name; keep original
return match.group(0)
# Choose the best candidate — simplest: first found
real_path = candidates[0]
rel_path = make_relative_path(file_path, real_path)
# Replace old image ref with new relative path
# Preserve the attribute (src= or url()) from original text
if match.group(1): # src="..."
new_ref = f'src="{rel_path}"'
else: # url(...)
new_ref = f'url("{rel_path}")'
nonlocal changed
changed = True
return new_ref
# Replace all image refs in file content
new_content = IMG_REF_PATTERN.sub(replacement, content)
if changed:
print(f"Fixed image references in {file_path}")
file_path.write_text(new_content, encoding="utf-8")
def main():
print(f"Scanning {ROOT_DIR} for images...")
images_on_disk = find_all_images(ROOT_DIR)
print(f"Found {sum(len(v) for v in images_on_disk.values())} images on disk.")
# Scan all pages and fix image refs
for page_path in ROOT_DIR.rglob("*"):
if page_path.suffix.lower() in PAGE_EXTS and page_path.is_file():
fix_image_paths_in_file(page_path, images_on_disk)
print("Done fixing image links!")
if __name__ == "__main__":
main()
BONUS SCRIPT:
The following is a script I custom-tailored for my own website. You will need to infer quite a bit to adapt it to yours, but it might be useful for someone. I’m mostly putting it here for myself; that’s why it’s at the bottom.
remove-global16.py
import os
import base64
import mimetypes
from pathlib import Path
from urllib.parse import urljoin, quote, unquote
from bs4 import BeautifulSoup
import requests
# === Configuration ===
SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_DIR = SCRIPT_DIR
OUTPUT_DIR = SCRIPT_DIR / "recompiled-pages"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
BASE_URL = "https://josefkulovany.com/"
masthead_map = {
"art": "Art _ Josef Kulovany.html",
"construction": "Construction _ Josef Kulovany.html",
"companies": "Companies _ Josef Kulovany.html",
"extracurricular": "Extracurricular _ Josef Kulovany.html",
"engineering-design": "Engineering _ Design _ Josef Kulovany.html",
"art-engineering-economics-design-programming-construction": "Art, Engineering, Economics, Design, Programming, Construction _ Josef Kulovany.html",
"latest-projects": "Latest Projects _ Josef Kulovany.htm"
}
def sanitize_filename(fn):
return quote(fn, safe='').replace('%20', '_').replace('%', '_')
def encode_image(path: Path):
try:
mime, _ = mimetypes.guess_type(path.name)
mime = mime or "application/octet-stream"
with open(path, "rb") as f:
encoded = base64.b64encode(f.read()).decode()
return f"data:{mime};base64,{encoded}"
except:
return None
def fetch_remote_image(url):
try:
r = requests.get(url, timeout=10)
r.raise_for_status()
mime, _ = mimetypes.guess_type(url)
mime = mime or "application/octet-stream"
encoded = base64.b64encode(r.content).decode()
return f"data:{mime};base64,{encoded}"
except:
return None
def fetch_text(url: str) -> str:
try:
r = requests.get(url, timeout=10)
r.raise_for_status()
return r.text
except:
return None
def process_html(file_path: Path):
soup = BeautifulSoup(file_path.read_text(encoding='utf-8', errors='ignore'), "html.parser")
base_path = file_path.parent
if soup.head and not soup.head.find("base"):
soup.head.insert(0, soup.new_tag("base", href="./"))
# Inline styles
for link in soup.find_all("link", rel="stylesheet", href=True):
href = link['href']
content = fetch_text(href) if href.startswith("http") else (base_path / href).read_text(encoding='utf-8', errors='ignore') if (base_path / href).exists() else None
if content:
style = soup.new_tag("style")
style.string = content
link.replace_with(style)
# Inline JS
for script in soup.find_all("script", src=True):
src = script['src']
content = fetch_text(src) if src.startswith("http") else (base_path / src).read_text(encoding='utf-8', errors='ignore') if (base_path / src).exists() else None
if content:
new = soup.new_tag("script")
new.string = content
script.replace_with(new)
# Fix and wrap images
for img in soup.find_all("img", src=True):
src = img['src']
decoded_src = unquote(src)
local = None
if not decoded_src.startswith(("http", "//")):
if decoded_src.startswith("/"):
local = (INPUT_DIR / decoded_src.lstrip('/')).resolve()
else:
local = (base_path / decoded_src).resolve()
data_uri = encode_image(local) if local and local.exists() else fetch_remote_image(urljoin(BASE_URL, decoded_src))
if data_uri:
img['data-orig-src'] = src
img['src'] = data_uri
img.attrs.pop("srcset", None)
else:
img['src'] = urljoin(BASE_URL, decoded_src)
if img.parent.name != "a":
wrapper = soup.new_tag("a", href=img['data-orig-src'] if "data-orig-src" in img.attrs else img['src'])
wrapper['class'] = 'zoomable'
img.wrap(wrapper)
# Lightbox script
script_tag = soup.new_tag("script")
script_tag.string = '''
document.addEventListener("click", function(e) {
const link = e.target.closest("a.zoomable");
if (!link) return;
e.preventDefault();
let existing = document.getElementById("zoom-overlay");
if (existing) existing.remove();
const overlay = document.createElement("div");
overlay.id = "zoom-overlay";
overlay.style = `
position:fixed;top:0;left:0;width:100%;height:100%;
background:rgba(0,0,0,0.8);
display:flex;align-items:center;justify-content:center;
cursor:pointer;
z-index:10000;
`;
const img = document.createElement("img");
img.src = link.href;
img.style = "max-width:90%; max-height:90%; box-shadow: 0 0 10px black;";
overlay.appendChild(img);
overlay.addEventListener("click", () => overlay.remove());
document.body.appendChild(overlay);
});
'''
(soup.body or soup).append(script_tag)
# Update masthead links
for a in soup.find_all("a", href=True):
href = a['href']
if href.startswith(BASE_URL):
path = href[len(BASE_URL):].strip("/").lower()
path = path.split('?')[0].split('#')[0]
if path in masthead_map:
a['href'] = masthead_map[path]
return str(soup)
def recompile_all():
for ext in ("*.html", "*.htm", "*.mhtml"):
for src in INPUT_DIR.rglob(ext):
if OUTPUT_DIR in src.parents:
continue
out_rel = src.relative_to(INPUT_DIR)
out_path = OUTPUT_DIR / out_rel
out_path.parent.mkdir(parents=True, exist_ok=True)
html = process_html(src)
out_path.write_text(html, encoding='utf-8')
print("✓", out_rel)
if __name__ == "__main__":
recompile_all()