Migrating Away from AI-Generated Websites

ai-migrate.zip (40.1 KB)
(In all scripts, replace josefkulovany.com or WEBSITENAME with your own domain and paths.)

Being, as I often am, a canary in the coal mine for others, I was yet again censored in the most uncouth way a few months back. This time, Hostinger deleted my hosting plan and kept my funds, pre-paid two years in advance, leaving me little choice but to migrate a personal website of mine that had been automatically generated with AI. With the site being AI-generated and no free migration tool readily available, I did what anyone would do: I wrote my own migration scripts.

And while these scripts didn’t work perfectly for me, they worked pretty darn well, so it follows that they might be good fodder for you, dear user, should mooks like Hostinger ever give you the boot too. God bless.

My manually migrated portfolio: https://josefkulovany.com/

This suite of scripts assumes you have already created a local copy of your website using the browser’s ‘Save As’ option (under File, and/or right-click > Save As), so your files are already sitting in a local folder. That local folder appears to be good to go, but it will often still reference images hosted on the provider’s CDN. Even if you download the images locally, the pages keep pointing at the CDN, buried in a long pile of boilerplate: your simple website actually consists of so much code referencing so many other hosts that it is rendered useless should you ever choose to leave. In other words, as soon as you stop paying them, your images stop showing up even though you’ve saved them locally. Never fear, zCHG.org is here…

The following scripts can easily be adapted for other AI website generators, e.g. Wix. Just copy/paste a given script into a bot’s prompt and ask something like “Adapt the following script for my website”, after looking at your site’s source code for clues about which CDN that particular generator uses.
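
If you’re not sure which CDN that is, here is a minimal sketch (my own addition, not part of the original suite) that tallies every external hostname referenced by your saved pages. The generator’s CDN is usually the host with the biggest count (assets.zyrosite.com in the Zyro/Hostinger case).

import re
from collections import Counter
from pathlib import Path

# Tally external hostnames referenced by locally saved HTML pages,
# so you can spot which CDN the generator relied on.
hosts = Counter()
for page in Path(".").rglob("*.html"):
    text = page.read_text(encoding="utf-8", errors="ignore")
    # Crude but effective: grab any absolute http(s) URL in the markup.
    for url in re.findall(r'https?://[^\s"\'<>()]+', text):
        hosts[url.split("/")[2]] += 1

for host, count in hosts.most_common(20):
    print(f"{count:6d}  {host}")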

You can also run more than one script over a given website, as a kind of filter chain. Say that after you’ve parsed a given website you now want to minify it as a second pass: you can do that. Useful. Useful because, although you may have achieved a stand-alone site, you did so while keeping too much boilerplate. Useful because you loved the design but hated the frill. Useful because it’s YOUR property now, and why would anyone want property that isn’t easy to edit?
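
For instance, here is a minimal sketch of such a second pass (assuming the previous script wrote its output into recompiled-pages/): it simply re-minifies every page in place with htmlmin.

from pathlib import Path
from htmlmin import minify

# Output folder written by the previous pass (adjust to your own layout).
PAGES_DIR = Path("recompiled-pages")

for page in PAGES_DIR.rglob("*.html"):
    html = page.read_text(encoding="utf-8", errors="ignore")
    # Strip comments and collapse whitespace as a standalone second filter.
    page.write_text(
        minify(html, remove_empty_space=True, remove_comments=True),
        encoding="utf-8",
    )
    print(f"minified: {page}")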

hybrid_recompile.py (untested) - This was suggested to me after the project was done; it probably works reasonably well. It combines the best attributes of the other five scripts, per the table below.

import os
import re
import shutil
import base64
import mimetypes
import requests
from pathlib import Path
from urllib.parse import urljoin, quote

from bs4 import BeautifulSoup
from htmlmin import minify
from csscompressor import compress
from jsmin import jsmin

BASE_URL = "https://josefkulovany.com/"
SCRIPT_DIR = Path(__file__).resolve().parent
SRC_DIR = SCRIPT_DIR
OUT_DIR = SCRIPT_DIR / "recompiled-pages"
ASSETS_DIR = OUT_DIR / "assets"
OUT_DIR.mkdir(parents=True, exist_ok=True)
ASSETS_DIR.mkdir(parents=True, exist_ok=True)

COPY_EXTENSIONS = {".woff", ".woff2", ".ttf", ".eot", ".otf", ".mp4", ".mp3", ".ico", ".svg", ".webp", ".bmp", ".gif", ".png", ".jpg", ".jpeg"}

def read_file(path):
    return path.read_text(encoding="utf-8", errors="ignore")

def write_file(path, content):
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")

def sanitize_filename(filename):
    return quote(filename, safe='').replace('%20', '_').replace('%', '_')

def copy_asset(src_path):
    src_path = src_path.resolve()
    if not src_path.exists() or not src_path.is_file():
        return None
    dest_name = src_path.name
    dest_path = ASSETS_DIR / dest_name
    counter = 1
    while dest_path.exists():
        stem = src_path.stem
        suffix = src_path.suffix
        dest_name = f"{stem}_{counter}{suffix}"
        dest_path = ASSETS_DIR / dest_name
        counter += 1
    shutil.copy2(src_path, dest_path)
    return Path("assets") / dest_name

def encode_image_base64(local_path):
    try:
        mime, _ = mimetypes.guess_type(str(local_path))
        if mime is None:
            mime = "application/octet-stream"
        b64_data = base64.b64encode(local_path.read_bytes()).decode('utf-8')
        return f"data:{mime};base64,{b64_data}"
    except Exception:
        return None

def inline_css(css_path, base_path, processed=None):
    if processed is None:
        processed = set()
    css_path = (base_path / css_path).resolve()
    if not css_path.exists() or css_path in processed:
        return ""
    processed.add(css_path)

    css_text = read_file(css_path)

    # Recursive @import
    def import_replacer(match):
        import_url = match.group(1).strip("'\"")
        if import_url.startswith("http") or import_url.startswith("data:"):
            return match.group(0)
        return inline_css(import_url, css_path.parent, processed)

    css_text = re.sub(r'@import\s+url\(([^)]+)\);?', import_replacer, css_text)
    css_text = re.sub(r'@import\s+["\']([^"\']+)["\'];?', import_replacer, css_text)

    def url_replacer(match):
        orig_url = match.group(1).strip("'\"")
        if orig_url.startswith("data:") or orig_url.startswith("http"):
            return f"url({orig_url})"
        asset_path = (css_path.parent / orig_url).resolve()
        copied = copy_asset(asset_path)
        return f"url({copied.as_posix()})" if copied else f"url({orig_url})"

    css_text = re.sub(r'url\(([^)]+)\)', url_replacer, css_text)
    return compress(css_text)

def process_html(html_path):
    soup = BeautifulSoup(read_file(html_path), "html.parser")
    base_path = html_path.parent

    if soup.head and not soup.find("base"):
        soup.head.insert(0, soup.new_tag("base", href="./"))

    # Inline CSS
    for link in soup.find_all("link", rel="stylesheet"):
        href = link.get("href", "")
        if href.startswith("http"):
            link.decompose()
            continue
        try:
            css_code = inline_css(href, base_path)
            style_tag = soup.new_tag("style")
            style_tag.string = css_code
            link.replace_with(style_tag)
        except:
            link.decompose()

    # Inline JS
    for script in soup.find_all("script", src=True):
        src = script["src"]
        if src.startswith("http"):
            script.decompose()
            continue
        js_path = (base_path / src).resolve()
        try:
            js_code = jsmin(read_file(js_path))
            new_script = soup.new_tag("script")
            new_script.string = js_code
            script.replace_with(new_script)
        except:
            script.decompose()

    # Inline images or fallback to BASE_URL
    for img in soup.find_all("img", src=True):
        src = img["src"]
        img_path = (base_path / src).resolve()
        if img_path.exists():
            data_uri = encode_image_base64(img_path)
            if data_uri:
                img["src"] = data_uri
                img.attrs.pop("srcset", None)
                continue
        img["src"] = urljoin(BASE_URL, src)

    # Other assets (audio, video, source, iframe, etc.)
    for tag in soup.find_all(src=True):
        src = tag["src"]
        if src.startswith("http") or src.startswith("data:"):
            continue
        asset_path = (base_path / src).resolve()
        copied = copy_asset(asset_path)
        tag["src"] = copied.as_posix() if copied else urljoin(BASE_URL, src)

    # Fonts/icons/etc
    for tag in soup.find_all(href=True):
        href = tag["href"]
        if href.startswith(("http", "data:", "#")):
            continue
        asset_path = (base_path / href).resolve()
        ext = asset_path.suffix.lower()
        copied = copy_asset(asset_path) if ext in COPY_EXTENSIONS else None
        tag["href"] = copied.as_posix() if copied else urljoin(BASE_URL, href)

    return minify(str(soup), remove_empty_space=True, remove_comments=True)

def recompile_all():
    for html_path in SRC_DIR.rglob("*.html"):
        rel_path = html_path.relative_to(SRC_DIR)
        out_path = OUT_DIR / rel_path
        try:
            compiled = process_html(html_path)
            write_file(out_path, compiled)
            print(f"✅ Compiled: {rel_path}")
        except Exception as e:
            print(f"❌ Error processing {rel_path}: {e}")

if __name__ == "__main__":
    recompile_all()

| Feature / Script | remove-global1.py | remove-global2.py | remove-global3.py | structure-and-image.py | structure_capture.py | hybrid_recompile.py |
| --- | --- | --- | --- | --- | --- | --- |
| Purpose | Replace Zyro image URLs | Simple asset inliner | Full local site compiler | Image inliner + URL rewiring | Live site resource rewiring | :fire: Full hybrid compiler |
| CSS Inlining | :cross_mark: | :white_check_mark: (basic) | :white_check_mark: (recursive @import) | :cross_mark: | :white_check_mark: (remote optional) | :white_check_mark: (recursive @import) |
| JS Inlining | :cross_mark: | :white_check_mark: | :white_check_mark: | :cross_mark: | :white_check_mark: (remote optional) | :white_check_mark: |
| IMG Inlining (base64) | :cross_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: (local, fallback to URL) | :white_check_mark: (remote optional) | :white_check_mark: (local, fallback to URL) |
| Fallback to Live URL | :cross_mark: | :cross_mark: | :cross_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: (for missing or skipped assets) |
| Minify HTML | :cross_mark: | :white_check_mark: | :white_check_mark: | :cross_mark: | :cross_mark: | :white_check_mark: |
| Asset Copying (fonts, etc.) | :cross_mark: | :white_check_mark: | :white_check_mark: (collision safe) | :cross_mark: | :cross_mark: | :white_check_mark: (collision safe) |
| Handles CSS @import recursively | :cross_mark: | :cross_mark: | :white_check_mark: | :cross_mark: | :white_check_mark: (remote-only) | :white_check_mark: |
| Filename Deduplication in /assets | :cross_mark: | :cross_mark: | :white_check_mark: | :cross_mark: | :cross_mark: | :white_check_mark: |
| Processes Remote Resources (live fetch) | :cross_mark: | :cross_mark: | :cross_mark: | :cross_mark: | :white_check_mark: (via requests) | :warning: (fallback URL only, no fetch) |
| Flat vs Tree Output Structure | Tree | Tree | Tree | Flat | Flat | Tree |
| `<base href="./">` Injection | :cross_mark: | :cross_mark: | :cross_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: |
| Dependencies | Minimal (regex, pathlib) | bs4, htmlmin, jsmin | bs4, htmlmin, jsmin | bs4 | bs4, requests | All above + mimetypes, requests |
| Ideal Use Case | Pre-clean Zyro exports | Local-only quick builds | Full self-contained site | Portable preview with live fallback | Mirroring or syncing to live site | :fire: Robust offline+online export tool |
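
None of these dependencies are exotic: the third-party ones are all on PyPI and install with `pip install beautifulsoup4 htmlmin csscompressor jsmin requests` (bs4 ships in the beautifulsoup4 package), while re, pathlib, and mimetypes are part of the standard library.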

The following is a script for repairing broken images. No idea why, but my own homepage was the only page I couldn’t resurrect in full: its images were broken base64. Base64 strings, you say? More on those later. In any case, broken base64 strings do no good, so this handy-dandy tool interactively replaces every base64 image in a given page (index.html) with one of the image files stored in the same folder as the target file and the script. It’s useful for that.

repair-2.py

from bs4 import BeautifulSoup
from pathlib import Path

# Input/Output HTML
html_path = Path("/home/josefkulovany/recompiled-pages/index.html")
output_path = html_path.with_name("index_manual_mapped.html")

# Your real files
real_images = [
    "272097746_2889050794718330_2429556499248538454_n-Yg21D3gZlKtaKOEq.jpg",
    "272170262_2889010071389069_839111276987981319_n-YbNDK81XJoflBe4N.jpg",
    "293256215_3011295135827228_1062208197907635003_n-AoPEOLNpbxSvk2Dr.jpg",
    "311983307_3096503063973101_4624707471446359481_n-mk3Leaz0bls5W4lq.jpg",
    "312918800_3096502983973109_8016347582259544278_n-Yg21DZPzj9FEQra2.jpg",
    "313425110_3097484513874956_5709197671429153513_n-YbNDKoq75WTWyeRZ.jpg",
    "img_2282-Yg21DoElqKu6lW0g.JPG",
    "unadjustednonraw_thumb_e21-YKboBnV1RpsjBGW2.jpg",
    "untitled-meP38MLj8QhL60qL.png"
]

# Load HTML
soup = BeautifulSoup(html_path.read_text(encoding="utf-8"), "html.parser")

# Replace base64 images
for i, img in enumerate(soup.find_all("img")):
    src = img.get("src", "")
    if src.startswith("data:image"):
        print(f"\n🔍 Found base64 image #{i+1}")
        print("Nearby context:")
        print("-" * 40)
        parent = img.find_parent()
        if parent:
            snippet = parent.get_text().strip()
            print(snippet[:300])
        print("-" * 40)

        # Show options
        print("Choose replacement image (or 's' to skip, 'q' to quit):")
        for idx, name in enumerate(real_images):
            print(f"{idx + 1}. {name}")
        
        while True:
            choice = input("Your choice [number | s | q]: ").strip().lower()
            if choice == 's':
                print("⏭️  Skipped.")
                break
            elif choice == 'q':
                print("👋 Exiting early. Saving progress...")
                output_path.write_text(str(soup), encoding="utf-8")
                exit(0)
            elif choice.isdigit():
                num = int(choice)
                if 1 <= num <= len(real_images):
                    replacement = real_images[num - 1]
                    img['src'] = f"./{replacement}"
                    print(f"✅ Replaced with: {replacement}")
                    break
            print("❌ Invalid input. Try again.")

# Save output
output_path.write_text(str(soup), encoding="utf-8")
print(f"\n🎉 Finished. Updated file saved to: {output_path}")

The following does a great job of capturing the structure of a given website, but lost track of images in my case:

structure_capture.py

import os
from bs4 import BeautifulSoup
from pathlib import Path
import requests
from urllib.parse import urljoin, urlparse, quote
import base64

# Use the actual location of the script
SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_DIR = SCRIPT_DIR
OUTPUT_DIR = SCRIPT_DIR / "recompiled-pages"
BASE_URL = "https://josefkulovany.com/"
EMBED_RESOURCES = False  # Set to True to inline CSS/images as base64

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def sanitize_filename(filename):
    return quote(filename, safe='').replace('%20', '_').replace('%', '_')

def embed_resource(url):
    try:
        res = requests.get(url)
        res.raise_for_status()
        content_type = res.headers.get("Content-Type", "application/octet-stream")
        data = base64.b64encode(res.content).decode('utf-8')
        return f"data:{content_type};base64,{data}"
    except Exception as e:
        print(f"Error embedding {url}: {e}")
        return url

def process_html(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Adjust base tag
    base_tag = soup.find("base")
    if base_tag:
        base_tag["href"] = "./"
    else:
        soup.head.insert(0, soup.new_tag("base", href="./"))

    # Fix resource links
    for tag, attr in [("link", "href"), ("script", "src"), ("img", "src")]:
        for el in soup.find_all(tag):
            if el.has_attr(attr):
                original_url = el[attr]
                full_url = urljoin(BASE_URL, original_url)
                if EMBED_RESOURCES and tag in ["link", "img"]:
                    if tag == "link" and "stylesheet" in el.get("rel", []):
                        try:
                            css = requests.get(full_url).text
                            style_tag = soup.new_tag("style")
                            style_tag.string = css
                            el.replace_with(style_tag)
                        except:
                            continue
                    elif tag == "img":
                        el[attr] = embed_resource(full_url)
                else:
                    el[attr] = full_url

    return str(soup)

def recompile_all():
    for file in INPUT_DIR.glob("*.html"):
        sanitized = sanitize_filename(file.name)
        output_file = OUTPUT_DIR / sanitized
        try:
            compiled_html = process_html(file)
            with open(output_file, "w", encoding="utf-8") as out:
                out.write(compiled_html)
            print(f"✔️ Recompiled: {file.name} -> {output_file.name}")
        except Exception as e:
            print(f"❌ Error processing {file.name}: {e}")

if __name__ == "__main__":
    recompile_all()

The following is the best generic structure + image scrape I have. Everything beyond this will need to be custom-tailored to your website (scroll down for an example).

structure-and-image.py

import os
from pathlib import Path
from urllib.parse import urljoin, quote
import base64
import mimetypes

from bs4 import BeautifulSoup

# Paths
SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_DIR = SCRIPT_DIR
OUTPUT_DIR = SCRIPT_DIR / "recompiled-pages"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://josefkulovany.com/"

def sanitize_filename(filename):
    return quote(filename, safe='').replace('%20', '_').replace('%', '_')

def encode_image_to_base64(local_path):
    try:
        mime, _ = mimetypes.guess_type(str(local_path))
        if mime is None:
            mime = "application/octet-stream"
        with open(local_path, "rb") as f:
            b64_data = base64.b64encode(f.read()).decode('utf-8')
        return f"data:{mime};base64,{b64_data}"
    except Exception:
        return None

def process_html(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # Ensure <base href="./"> for relative links
    if not soup.find("base"):
        soup.head.insert(0, soup.new_tag("base", href="./"))

    base_path = file_path.parent

    # For link and script tags: rewrite URLs to live absolute URLs (preserve site structure and CSS/JS loading)
    for tag, attr in [("link", "href"), ("script", "src")]:
        for el in soup.find_all(tag):
            if el.has_attr(attr):
                original_url = el[attr]
                # Make absolute URL on live site
                live_url = urljoin(BASE_URL, original_url)
                el[attr] = live_url

    # For img tags: try local file, inline base64 if possible, else fallback to live absolute URL
    for img in soup.find_all("img", src=True):
        orig_src = img["src"]
        local_img_path = (base_path / orig_src).resolve()

        if local_img_path.exists() and local_img_path.is_file():
            data_uri = encode_image_to_base64(local_img_path)
            if data_uri:
                img["src"] = data_uri
                # Remove srcset if present because it conflicts with data URI
                if "srcset" in img.attrs:
                    del img.attrs["srcset"]
            else:
                # Could not encode, fallback to live URL
                img["src"] = urljoin(BASE_URL, orig_src)
        else:
            # No local file, use live URL
            img["src"] = urljoin(BASE_URL, orig_src)

    return str(soup)

def recompile_all():
    for file in INPUT_DIR.glob("*.html"):
        sanitized = sanitize_filename(file.name)
        output_file = OUTPUT_DIR / sanitized
        try:
            compiled_html = process_html(file)
            with open(output_file, "w", encoding="utf-8") as out:
                out.write(compiled_html)
            print(f"✔️ Recompiled: {file.name} -> {output_file.name}")
        except Exception as e:
            print(f"❌ Error processing {file.name}: {e}")

if __name__ == "__main__":
    recompile_all()

These are older versions of the same thing. They are more generic fodder.

:white_check_mark: remove-global1.py - Targeted Zyro CDN Image Replacer

Usefulness:

  • Specifically designed to replace remote Zyro-hosted image URLs (https://assets.zyrosite.com/...) with local equivalents.
  • It scans HTML/PHP files, searches for Zyro image URLs, and replaces them if a matching file is found locally.

Key Features:

  • Minimal and fast.
  • Uses re (regex) for pattern matching and Path for filesystem navigation.
  • Only focuses on remote image replacement, not inlining or asset compression.

Best For:

  • Cleaning up remote references from a Zyro-exported site.
  • Light preprocessing before more aggressive compilation/inlining.

:white_check_mark: remove-global2.py - Basic Inliner and Recompiler

Usefulness:

  • Inlines local CSS, JS, and IMG files.
  • Copies non-inlined assets (fonts, media, icons) to a centralized assets/ folder.
  • Removes external references (e.g., to CDN scripts or styles).
  • Minifies the resulting HTML.

Key Features:

  • Clean, readable structure.
  • Handles basic inlining and asset copying.
  • No recursion on @import in CSS.
  • Less robust in file name conflict resolution (e.g., assumes unique asset names).

Best For:

  • Quick inlining and cleanup of a local site.
  • Small projects or simple export workflows.

:white_check_mark: remove-global3.py - Advanced Recursive Inliner with Conflict Handling

Usefulness:

  • Recursively inlines @import statements in CSS files.
  • Automatically resolves filename conflicts in assets/ by renaming.
  • Extends asset support to images and more media types (e.g., .jpg, .png, .jpeg).
  • More fault-tolerant and comprehensive than v2.

Key Features:

  • Prevents infinite recursion in CSS @imports using a set of processed files.
  • Resolves duplicate asset names by appending numeric suffixes (e.g., font_1.woff).
  • More complete COPY_EXTENSIONS set than v2.
  • More robust and production-safe.

Best For:

  • Full site compilation with complex CSS imports.
  • Real-world exports where duplicate asset names are common.
  • Handling large or messy input folders.

remove-global1.py

import re
from pathlib import Path

# Root folder of your website files
ROOT = Path("/home/WEBSITENAME")

# Folder(s) where local images reside (adjust if images scattered elsewhere)
LOCAL_IMAGE_FOLDERS = [
    ROOT / "images",
    ROOT / "assets" / "images",
    # add more if needed
]

# Regex to find remote Zyro CDN URLs in img src or other tags
REMOTE_IMG_URL_PATTERN = re.compile(
    r'https://assets\.zyrosite\.com/[^"\']+'
)

def find_local_image(filename):
    """Search all LOCAL_IMAGE_FOLDERS for a file with given filename"""
    for folder in LOCAL_IMAGE_FOLDERS:
        candidate = folder / filename
        if candidate.exists():
            return candidate.relative_to(ROOT)
    # fallback: try anywhere under ROOT
    for candidate in ROOT.rglob(filename):
        if candidate.is_file():
            return candidate.relative_to(ROOT)
    return None

def fix_file(file_path):
    content = file_path.read_text(encoding="utf-8", errors="ignore")
    changed = False
    urls = set(REMOTE_IMG_URL_PATTERN.findall(content))

    for url in urls:
        filename = url.split("/")[-1].split("?")[0].split("#")[0]
        local_rel_path = find_local_image(filename)
        if local_rel_path:
            # Use forward slashes in HTML paths
            local_rel_str = "/" + str(local_rel_path).replace("\\", "/")
            content = content.replace(url, local_rel_str)
            changed = True
            print(f"Replaced {url} → {local_rel_str} in {file_path}")
        else:
            print(f"⚠️ WARNING: local image file not found for {filename} referenced in {file_path}")

    if changed:
        file_path.write_text(content, encoding="utf-8")

def main():
    print(f"Scanning {ROOT} for .php and .html files to fix image URLs...\n")
    for file_path in ROOT.rglob("*"):
        if file_path.suffix.lower() in {".php", ".html"}:
            fix_file(file_path)
    print("\nAll done!")

if __name__ == "__main__":
    main()

remove-global2.py

import os
import shutil
import base64
from pathlib import Path
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from htmlmin import minify
from csscompressor import compress
from jsmin import jsmin

SRC_DIR = Path("/home/WEBSITENAME").resolve()
OUT_DIR = SRC_DIR / "recompiled-pages"
ASSETS_DIR = OUT_DIR / "assets"
OUT_DIR.mkdir(exist_ok=True, parents=True)
ASSETS_DIR.mkdir(exist_ok=True, parents=True)

# Asset file extensions to copy (non-inlined)
COPY_EXTENSIONS = {".woff", ".woff2", ".ttf", ".eot", ".otf", ".mp4", ".mp3", ".ico", ".svg", ".webp", ".bmp", ".gif"}

def read_file(path):
    return path.read_text(encoding="utf-8", errors="ignore")

def write_file(path, content):
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")

def copy_asset(src_path):
    """
    Copy a local asset into the assets folder if not already copied.
    Returns the relative path inside OUT_DIR to use in rewritten HTML.
    """
    src_path = src_path.resolve()
    if not src_path.exists() or not src_path.is_file():
        return None
    dest_path = ASSETS_DIR / src_path.name
    if not dest_path.exists():
        shutil.copy2(src_path, dest_path)
    return Path("assets") / src_path.name

def inline_css(css_path, base_path):
    css_path = (base_path / css_path).resolve()
    if not css_path.exists():
        return ""
    css_text = read_file(css_path)

    # Find url(...) references inside CSS to copy assets and rewrite URLs
    def replace_url(match):
        url = match.group(1).strip('\'"')
        if url.startswith("data:") or url.startswith("http"):
            return f"url({url})"
        asset_path = (css_path.parent / url).resolve()
        copied = copy_asset(asset_path)
        if copied:
            return f"url({copied.as_posix()})"
        return f"url({url})"

    import re
    css_text = re.sub(r"url\(([^)]+)\)", replace_url, css_text)

    return compress(css_text)

def inline_assets(html_path):
    html_path = html_path.resolve()
    html = read_file(html_path)
    soup = BeautifulSoup(html, "html.parser")

    base_path = html_path.parent

    # Inline <link rel=stylesheet>
    for link in soup.find_all("link", rel="stylesheet"):
        href = link.get("href", "")
        if href.startswith("http"):
            # External URL - remove or keep? Here, remove
            link.decompose()
            continue
        try:
            css_code = inline_css(href, base_path)
            style_tag = soup.new_tag("style")
            style_tag.string = css_code
            link.replace_with(style_tag)
        except Exception:
            link.decompose()

    # Inline <script src=...>
    for script in soup.find_all("script", src=True):
        src = script["src"]
        if src.startswith("http"):
            script.decompose()
            continue
        js_path = (base_path / src).resolve()
        if js_path.exists():
            try:
                js_code = jsmin(read_file(js_path))
                new_script = soup.new_tag("script")
                new_script.string = js_code
                script.replace_with(new_script)
            except Exception:
                script.decompose()
        else:
            script.decompose()

    # Inline <img src=...>
    for img in soup.find_all("img", src=True):
        src = img["src"]
        if src.startswith("http"):
            # External - remove or keep? Remove here
            img.decompose()
            continue
        img_path = (base_path / src).resolve()
        if img_path.exists() and img_path.is_file():
            try:
                mime = f"image/{img_path.suffix.lstrip('.')}"
                b64 = base64.b64encode(img_path.read_bytes()).decode()
                img["src"] = f"data:{mime};base64,{b64}"
                # Also remove srcset attribute if present
                if "srcset" in img.attrs:
                    del img.attrs["srcset"]
            except Exception:
                img.decompose()
        else:
            img.decompose()

    # Rewrite other asset references in tags (audio, video, source, etc.)
    for tag in soup.find_all(src=True):
        src = tag["src"]
        if src.startswith("http") or src.startswith("data:"):
            continue
        asset_path = (base_path / src).resolve()
        copied = copy_asset(asset_path)
        if copied:
            tag["src"] = copied.as_posix()
        else:
            # Remove tags referencing missing assets
            tag.decompose()

    # Rewrite href assets (fonts, icons, etc.)
    for tag in soup.find_all(href=True):
        href = tag["href"]
        if href.startswith("http") or href.startswith("data:") or href.startswith("#"):
            continue
        asset_path = (base_path / href).resolve()
        ext = asset_path.suffix.lower()
        if ext in COPY_EXTENSIONS:
            copied = copy_asset(asset_path)
            if copied:
                tag["href"] = copied.as_posix()
            else:
                tag.decompose()

    # Minify final HTML output
    minified_html = minify(str(soup), remove_empty_space=True, remove_comments=True)

    return minified_html

def process_all_pages():
    for root, _, files in os.walk(SRC_DIR):
        root_path = Path(root)
        # Skip output folders to avoid recursion
        if root_path == OUT_DIR or root_path.is_relative_to(OUT_DIR):
            continue
        for file in files:
            if file.lower().endswith((".html", ".php")):
                html_path = root_path / file
                rel_path = html_path.relative_to(SRC_DIR)
                out_path = OUT_DIR / rel_path
                try:
                    compiled_html = inline_assets(html_path)
                    write_file(out_path.with_suffix(".html"), compiled_html)
                    print(f"✅ Compiled: {out_path.with_suffix('.html')}")
                except Exception as e:
                    print(f"❌ Error processing {html_path}: {e}")

if __name__ == "__main__":
    process_all_pages()

remove-global3.py

import os
import re
import shutil
import base64
from pathlib import Path

from bs4 import BeautifulSoup
from htmlmin import minify
from csscompressor import compress
from jsmin import jsmin

SRC_DIR = Path("/home/WEBSITENAME").resolve()
OUT_DIR = SRC_DIR / "recompiled-pages"
ASSETS_DIR = OUT_DIR / "assets"
OUT_DIR.mkdir(exist_ok=True, parents=True)
ASSETS_DIR.mkdir(exist_ok=True, parents=True)

COPY_EXTENSIONS = {".woff", ".woff2", ".ttf", ".eot", ".otf", ".mp4", ".mp3", ".ico", ".svg", ".webp", ".bmp", ".gif", ".png", ".jpg", ".jpeg"}

def read_file(path):
    return path.read_text(encoding="utf-8", errors="ignore")

def write_file(path, content):
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")

def copy_asset(src_path):
    src_path = src_path.resolve()
    if not src_path.exists() or not src_path.is_file():
        return None
    # Create a unique filename in assets folder to avoid collisions
    dest_name = src_path.name
    dest_path = ASSETS_DIR / dest_name
    counter = 1
    while dest_path.exists():
        # If collision, add suffix number
        stem = src_path.stem
        suffix = src_path.suffix
        dest_name = f"{stem}_{counter}{suffix}"
        dest_path = ASSETS_DIR / dest_name
        counter += 1
    shutil.copy2(src_path, dest_path)
    return Path("assets") / dest_name

def inline_css(css_path, base_path, processed_files=None):
    """
    Read CSS file, recursively inline @import CSS,
    fix URLs to copied assets, and compress.
    """
    if processed_files is None:
        processed_files = set()

    css_path = (base_path / css_path).resolve()
    if css_path in processed_files:
        return ""  # Avoid circular imports
    processed_files.add(css_path)

    if not css_path.exists():
        return ""

    css_text = read_file(css_path)

    # Handle @import statements recursively
    def import_replacer(match):
        import_url = match.group(1).strip('\'"')
        # Only process local imports
        if import_url.startswith("http") or import_url.startswith("data:"):
            return match.group(0)
        imported_css = inline_css(import_url, css_path.parent, processed_files)
        return imported_css

    css_text = re.sub(r'@import\s+url\(([^)]+)\);?', import_replacer, css_text)
    css_text = re.sub(r'@import\s+["\']([^"\']+)["\'];?', import_replacer, css_text)

    # Fix url(...) paths: copy assets and rewrite URLs to new assets folder
    def url_replacer(match):
        orig_url = match.group(1).strip('\'"')
        if orig_url.startswith("data:") or orig_url.startswith("http"):
            return f"url({orig_url})"
        asset_path = (css_path.parent / orig_url).resolve()
        copied = copy_asset(asset_path)
        if copied:
            return f"url({copied.as_posix()})"
        else:
            return f"url({orig_url})"

    css_text = re.sub(r'url\(([^)]+)\)', url_replacer, css_text)

    # Compress CSS
    return compress(css_text)

def inline_assets(html_path):
    html_path = html_path.resolve()
    html = read_file(html_path)
    soup = BeautifulSoup(html, "html.parser")
    base_path = html_path.parent

    # Inline <link rel=stylesheet> (CSS)
    for link in soup.find_all("link", rel="stylesheet"):
        href = link.get("href", "")
        if href.startswith("http"):
            # External CSS: skip or remove?
            link.decompose()
            continue
        try:
            css_code = inline_css(href, base_path)
            style_tag = soup.new_tag("style")
            style_tag.string = css_code
            link.replace_with(style_tag)
        except Exception:
            link.decompose()

    # Inline <script src=...>
    for script in soup.find_all("script", src=True):
        src = script["src"]
        if src.startswith("http"):
            # Remove external scripts (or keep if needed)
            script.decompose()
            continue
        js_path = (base_path / src).resolve()
        if js_path.exists():
            try:
                js_code = jsmin(read_file(js_path))
                new_script = soup.new_tag("script")
                new_script.string = js_code
                script.replace_with(new_script)
            except Exception:
                script.decompose()
        else:
            script.decompose()

    # Inline images <img>
    for img in soup.find_all("img", src=True):
        src = img["src"]
        if src.startswith("http"):
            img.decompose()
            continue
        img_path = (base_path / src).resolve()
        if img_path.exists() and img_path.is_file():
            try:
                mime = f"image/{img_path.suffix.lstrip('.')}"
                b64 = base64.b64encode(img_path.read_bytes()).decode()
                img["src"] = f"data:{mime};base64,{b64}"
                if "srcset" in img.attrs:
                    del img.attrs["srcset"]
            except Exception:
                img.decompose()
        else:
            img.decompose()

    # Rewrite other src attributes (audio, video, source, iframe, etc)
    for tag in soup.find_all(src=True):
        src = tag["src"]
        if src.startswith("http") or src.startswith("data:"):
            continue
        asset_path = (base_path / src).resolve()
        copied = copy_asset(asset_path)
        if copied:
            tag["src"] = copied.as_posix()
        else:
            tag.decompose()

    # Rewrite href attributes for fonts, icons, etc.
    for tag in soup.find_all(href=True):
        href = tag["href"]
        if href.startswith("http") or href.startswith("data:") or href.startswith("#"):
            continue
        asset_path = (base_path / href).resolve()
        ext = asset_path.suffix.lower()
        if ext in COPY_EXTENSIONS:
            copied = copy_asset(asset_path)
            if copied:
                tag["href"] = copied.as_posix()
            else:
                tag.decompose()

    # Minify final HTML output
    minified_html = minify(str(soup), remove_empty_space=True, remove_comments=True)

    return minified_html

def process_all_pages():
    for root, _, files in os.walk(SRC_DIR):
        root_path = Path(root)
        # Skip output folder to prevent recursion
        if root_path == OUT_DIR or OUT_DIR in root_path.parents:
            continue
        for file in files:
            if file.lower().endswith((".html", ".php")):
                html_path = root_path / file
                rel_path = html_path.relative_to(SRC_DIR)
                out_path = OUT_DIR / rel_path
                try:
                    compiled_html = inline_assets(html_path)
                    write_file(out_path.with_suffix(".html"), compiled_html)
                    print(f"✅ Compiled: {out_path.with_suffix('.html')}")
                except Exception as e:
                    print(f"❌ Error processing {html_path}: {e}")

if __name__ == "__main__":
    process_all_pages()

SYMLINK TOOLS

The following are precursors to the main scripts above. Scroll down past them for our UNO REVERSE card, which undoes these scripts’ effects, more or less.

  • map_images_to_pages.py: scanning pages and mapping image references (detects missing vs found images).
  • help-recursive.py: scanning pages for asset references, creating symlinks to local files if missing, trying remote downloads, or creating placeholders.
  • map_assets_symlinks.py: scanning pages and media references to create symlinks for all found media in a centralized folder.

1. Image Reference Mapping Script

What it does:

  • Recursively scans your entire project folder for:
    • Web page files (.html, .php)
    • Image files (.jpg, .png, .svg, etc.)
  • Extracts all image references (src="..." or url(...)) inside the web pages.
  • Checks if each referenced image actually exists on disk.
  • Builds a report listing:
    • Found images that exist and which pages reference them.
    • Missing images that are referenced but not found.
  • Outputs a summary and writes the detailed report to image_mapping_report.txt.

What it’s useful for:

  • Identifying broken or missing image links in your web project.
  • Understanding which pages reference which images.
  • Helping clean up or fix missing asset issues.
  • Preparing for asset reorganization or migration.

2. Asset Symlinking, Downloading, and Placeholder Script

What it does:

  • Scans .html and .php files for common asset references (images, scripts, stylesheets, iframes, videos, audio, objects).
  • For each asset reference, it:
    • Checks if the asset exists locally.
    • If missing, tries to find a local file with the same filename anywhere in the project.
    • If still missing, tries downloading from known remote prefixes (like YouTube, Google Fonts).
    • If download fails, creates a placeholder file.
  • Creates symbolic links from the expected asset location to the found/downloaded/placeholder asset.
  • Tracks statistics on how many symlinks, downloads, placeholders, and existing assets it handled.

What it’s useful for:

  • Automatically repairing broken or missing asset links by linking or downloading assets.
  • Creating placeholders so the site still works even if assets are missing.
  • Simplifying asset management by centralizing missing assets into fallback directories.
  • Useful during migration, deployment, or cleaning up incomplete projects.

3. Media Symlinking Script

What it does:

  • Walks through the project directory recursively.
  • For each .html or .php file, parses it (using BeautifulSoup) to find media references (img, link, script tags) that point to image or media files.
  • Also extracts url(...) references in CSS styles.
  • For each referenced media file found locally:
    • Creates a symlink in a dedicated folder (here named public_assets).
  • Avoids duplicating symlinks if already created.

What it’s useful for:

  • Gathering all referenced media files into one central folder via symlinks.
  • Simplifying asset paths and access by having one “public assets” directory.
  • Useful for deployments, CDN preparation, or bundling assets.
  • Keeping original files in place but exposing them via centralized symlinks.

Summary

| Script | Purpose / What it Does | Use Case |
| --- | --- | --- |
| Image Reference Mapping | Maps image references and checks for missing images | Finds broken image links and missing assets |
| Asset Symlinking & Downloading | Fixes missing assets by symlinking or downloading, creates placeholders | Repairs missing files, automates asset fixes |
| Media Symlinking | Collects all referenced media files and symlinks them into a central folder | Simplifies asset management and deployment |

map_images_to_pages.py

import os
import re
from pathlib import Path
from collections import defaultdict

# Set your root directory
root = Path("/home/WEBSITENAME/")

# Supported web file extensions
page_exts = {".html", ".php"}
img_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".bmp", ".tiff"}

# Image src pattern: <img src="..."> or url('...')
img_pattern = re.compile(r'src=["\']([^"\']+)["\']|url\(([^)]+)\)', re.IGNORECASE)

# Map from image path => list of pages referencing it
image_references = defaultdict(set)

# Track all valid images on disk (absolute and relative to root)
valid_images = set()

# Build list of valid image files
for img in root.rglob("*"):
    if img.suffix.lower() in img_exts and img.is_file():
        rel = img.relative_to(root)
        valid_images.add(str(rel))
        valid_images.add("/" + str(rel))  # Handle both formats

# Crawl all .html and .php files
for file in root.rglob("*"):
    if file.suffix.lower() in page_exts:
        content = file.read_text(errors="ignore")
        rel_page = str(file.relative_to(root))

        for match in img_pattern.findall(content):
            # Combine both capture groups
            img_src = match[0] or match[1]
            img_src = img_src.strip("\"' ")

            # Clean leading "./", "../", or quotes
            img_src = os.path.normpath(img_src.lstrip("./"))

            # Record reference
            image_references[img_src].add(rel_page)

# Build output: image references → valid / missing
missing = []
mapped = []

print("\n=== IMAGE REFERENCE MAPPING ===\n")
for img, pages in sorted(image_references.items()):
    found = img in valid_images
    pages_str = ", ".join(sorted(pages))

    if found:
        mapped.append((img, pages))
        print(f"[FOUND]   {img}\n          ↳ {pages_str}\n")
    else:
        missing.append((img, pages))
        print(f"[MISSING] {img}\n          ↳ {pages_str}\n")

print("=== SUMMARY ===")
print(f"✓ Total valid images found:   {len(mapped)}")
print(f"✗ Total missing/orphaned:     {len(missing)}")
print(f"🔎 Total image references:     {len(image_references)}")

# Optionally: write results to a report file
with open("image_mapping_report.txt", "w") as f:
    for img, pages in sorted(mapped):
        f.write(f"[FOUND] {img} -> {', '.join(sorted(pages))}\n")
    for img, pages in sorted(missing):
        f.write(f"[MISSING] {img} -> {', '.join(sorted(pages))}\n")

help-recursive.py

import os
import re
import shutil
import requests
from pathlib import Path
from urllib.parse import urljoin, urlparse

# CONFIG
project_root = Path("/home/WEBSITENAME")  # your actual root
known_remote_prefixes = [
    "https://www.youtube.com", "https://fonts.googleapis.com", "https://fonts.gstatic.com"
]
fallback_dir = project_root / "_missing_assets"
fallback_dir.mkdir(exist_ok=True)

# TAGS and ATTRIBUTES to scan
asset_patterns = {
    "img": "src",
    "script": "src",
    "link": "href",
    "iframe": "src",
    "source": "src",
    "video": "src",
    "audio": "src",
    "object": "data",
}

# EXTENSIONS to look for
web_exts = [".html", ".php"]

# STORAGE
missing_assets = set()
symlinked = 0
downloaded = 0
placeheld = 0
existing = 0

def is_valid_url(link):
    return any(link.startswith(p) for p in known_remote_prefixes)

def extract_assets(html_text):
    assets = []
    for tag, attr in asset_patterns.items():
        matches = re.findall(fr'<{tag}[^>]*{attr}=["\'](.*?)["\']', html_text, re.IGNORECASE)
        assets.extend(matches)
    return assets

def try_find_local_match(filename):
    for path in project_root.rglob("*"):
        if path.name == filename and path.is_file():
            return path
    return None

def safe_download(url, dest):
    try:
        r = requests.get(url, timeout=5)
        r.raise_for_status()
        dest.write_bytes(r.content)
        return True
    except Exception as e:
        print(f"⚠ Failed to download {url} → {e}")
        return False

def create_placeholder(path):
    path.write_text("// placeholder file\n")

print("🔎 Scanning HTML and PHP files...\n")

for file in project_root.rglob("*"):
    if file.suffix.lower() not in web_exts or not file.is_file():
        continue

    html = file.read_text(errors="ignore")
    asset_paths = extract_assets(html)

    for ref in asset_paths:
        if not ref or ref.startswith("data:") or ref.startswith("#"):
            continue
        # Parse and normalize
        asset_path = urlparse(ref).path
        asset_path = asset_path.lstrip("/")  # always relative to web root
        asset_file = project_root / asset_path

        if asset_file.exists():
            existing += 1
            continue

        missing_assets.add(asset_path)
        asset_name = Path(asset_path).name
        asset_target_dir = project_root / os.path.dirname(asset_path)
        asset_target_dir.mkdir(parents=True, exist_ok=True)
        dest_path = asset_target_dir / asset_name

        # Try local match
        local = try_find_local_match(asset_name)
        if local:
            try:
                dest_path.symlink_to(local.resolve())
                print(f"🔗 Symlinked {dest_path} → {local}")
                symlinked += 1
                continue
            except Exception as e:
                print(f"❌ Symlink error: {e}")
        
        # Try remote download
        for remote_base in known_remote_prefixes:
            full_url = urljoin(remote_base + "/", asset_path)
            download_dest = fallback_dir / asset_name
            if safe_download(full_url, download_dest):
                try:
                    dest_path.symlink_to(download_dest.resolve())
                    print(f"🌐 Downloaded + Linked {dest_path} ← {full_url}")
                    downloaded += 1
                    break
                except Exception as e:
                    print(f"❌ Link fail after download: {e}")
                    continue
        else:
            # Final fallback: placeholder
            create_placeholder(dest_path)
            print(f"📄 Placeholder created for {dest_path}")
            placeheld += 1

print("\n✅ Done!")
print(f"✔ Existing assets found: {existing}")
print(f"🔗 Symlinks created: {symlinked}")
print(f"🌍 Files downloaded: {downloaded}")
print(f"📄 Placeholders created: {placeheld}")
print(f"❓ Total missing (unique): {len(missing_assets)}")

map_assets_symlinks.py

import os
import re
from pathlib import Path
from bs4 import BeautifulSoup

# Supported image/media extensions
MEDIA_EXTS = {'.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp', '.ico'}

# Directory where symlinks to all found media will be stored
SYMLINK_DEST = Path("public_assets")
SYMLINK_DEST.mkdir(exist_ok=True)

# Root directory to scan
ROOT_DIR = Path(".").resolve()

# Patterns to extract src/href/urls
SRC_HREF_REGEX = re.compile(r'''(?:src|href)\s*=\s*["']([^"']+)["']''')
URL_REGEX = re.compile(r'url\(["\']?([^"\')]+)["\']?\)')

def find_media_references(filepath):
    """Extract all image/media paths from a file."""
    refs = set()
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()

        # Use BeautifulSoup for structured HTML
        if filepath.suffix in {".html", ".php"}:
            soup = BeautifulSoup(content, "html.parser")
            for tag in soup.find_all(["img", "link", "script"]):
                for attr in ["src", "href"]:
                    val = tag.get(attr)
                    if val and any(val.lower().endswith(ext) for ext in MEDIA_EXTS):
                        refs.add(val)
        # Also check CSS-style `url(...)` refs
        refs.update(URL_REGEX.findall(content))

    return refs

def normalize_ref(ref, base_path):
    """Resolve relative path refs to absolute ones."""
    ref = ref.split("?")[0].split("#")[0]  # Strip query/fragments
    ref_path = (base_path / ref).resolve()
    return ref_path if ref_path.exists() else None

def symlink_media(ref_path):
    """Create a symlink in the public assets folder if it doesn't exist."""
    try:
        if ref_path.suffix.lower() not in MEDIA_EXTS:
            return
        target = SYMLINK_DEST / ref_path.name
        if not target.exists():
            os.symlink(ref_path, target)
            print(f"✅ Linked: {target} → {ref_path}")
    except Exception as e:
        print(f"⚠️ Failed to link {ref_path}: {e}")

def main():
    for root, dirs, files in os.walk(ROOT_DIR):
        for file in files:
            if file.endswith((".html", ".php")):
                filepath = Path(root) / file
                refs = find_media_references(filepath)
                for ref in refs:
                    ref_path = normalize_ref(ref, filepath.parent)
                    if ref_path:
                        symlink_media(ref_path)

if __name__ == "__main__":
    main()

UNO REVERSE CARD (always better to make a backup as a tar or zip instead of this…)
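
If you take that advice, here is a minimal sketch of such a backup (my own addition, assuming the same /home/WEBSITENAME root used by the scripts above); run it before letting repair-migrate.py rewrite files in place.

import shutil
from datetime import datetime
from pathlib import Path

# Same site root the other scripts use; adjust to your own layout.
ROOT = Path("/home/WEBSITENAME")

# Write a timestamped .tar.gz next to wherever you run this from.
stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
archive = shutil.make_archive(
    f"website-backup-{stamp}", "gztar",
    root_dir=ROOT.parent, base_dir=ROOT.name,
)
print(f"Backup written to {archive}")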

repair-migrate.py

import os
import re
from pathlib import Path

# Root directory to scan & fix
ROOT_DIR = Path("/home/WEBSITENAME")  # change this to your root

# Extensions of pages to scan
PAGE_EXTS = {".html", ".php"}

# Image file extensions to recognize
IMG_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".bmp", ".tiff"}

# Regex pattern to find image refs in HTML and CSS styles (src="...", url(...))
IMG_REF_PATTERN = re.compile(
    r"""(?:
        src\s*=\s*["']([^"']+)["'] |      # src="..."
        url\(\s*['"]?([^"')]+)['"]?\s*\)  # url('...')
    )""",
    re.IGNORECASE | re.VERBOSE,
)

def find_all_images(root):
    """Build a dict: filename -> list of absolute image paths found on disk"""
    images = {}
    for img_path in root.rglob("*"):
        if img_path.is_file() and img_path.suffix.lower() in IMG_EXTS:
            name = img_path.name
            images.setdefault(name, []).append(img_path.resolve())
    return images

def make_relative_path(from_path, to_path):
    """Make a relative path from from_path parent to to_path"""
    return os.path.relpath(to_path, start=from_path.parent)

def fix_image_paths_in_file(file_path, images_on_disk):
    """Scan and fix image references in one HTML/PHP file."""
    content = file_path.read_text(encoding="utf-8", errors="ignore")
    changed = False

    def replacement(match):
        # Extract image URL from src or url()
        img_ref = match.group(1) or match.group(2)
        img_ref_clean = img_ref.split("?")[0].split("#")[0]  # Remove query/fragment
        img_name = os.path.basename(img_ref_clean)

        # Check if referenced path exists relative to page
        ref_path = (file_path.parent / img_ref_clean).resolve()
        if ref_path.exists():
            # Image found at current reference, no change needed
            return match.group(0)

        # Image file missing at current path, try to find it by filename
        candidates = images_on_disk.get(img_name)
        if not candidates:
            # No image found on disk by that name; keep original
            return match.group(0)

        # Choose the best candidate — simplest: first found
        real_path = candidates[0]
        rel_path = make_relative_path(file_path, real_path)

        # Replace old image ref with new relative path
        # Preserve the attribute (src= or url()) from original text
        if match.group(1):  # src="..."
            new_ref = f'src="{rel_path}"'
        else:  # url(...)
            new_ref = f'url("{rel_path}")'

        nonlocal changed
        changed = True
        return new_ref

    # Replace all image refs in file content
    new_content = IMG_REF_PATTERN.sub(replacement, content)

    if changed:
        print(f"Fixed image references in {file_path}")
        file_path.write_text(new_content, encoding="utf-8")

def main():
    print(f"Scanning {ROOT_DIR} for images...")
    images_on_disk = find_all_images(ROOT_DIR)
    print(f"Found {sum(len(v) for v in images_on_disk.values())} images on disk.")

    # Scan all pages and fix image refs
    for page_path in ROOT_DIR.rglob("*"):
        if page_path.suffix.lower() in PAGE_EXTS and page_path.is_file():
            fix_image_paths_in_file(page_path, images_on_disk)

    print("Done fixing image links!")

if __name__ == "__main__":
    main()

BONUS SCRIPT:

The following is a script I had custom-tailored for my own website. You will need to infer quite a bit to adapt it to yours, but it might be useful for someone. I’m mostly just putting it here for myself; that’s why it’s at the bottom.

remove-global16.py

import os
import base64
import mimetypes
from pathlib import Path
from urllib.parse import urljoin, quote, unquote
from bs4 import BeautifulSoup
import requests

# === Configuration ===
SCRIPT_DIR = Path(__file__).resolve().parent
INPUT_DIR = SCRIPT_DIR
OUTPUT_DIR = SCRIPT_DIR / "recompiled-pages"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

BASE_URL = "https://josefkulovany.com/"

masthead_map = {
    "art": "Art _ Josef Kulovany.html",
    "construction": "Construction _ Josef Kulovany.html",
    "companies": "Companies _ Josef Kulovany.html",
    "extracurricular": "Extracurricular _ Josef Kulovany.html",
    "engineering-design": "Engineering _ Design _ Josef Kulovany.html",
    "art-engineering-economics-design-programming-construction": "Art, Engineering, Economics, Design, Programming, Construction _ Josef Kulovany.html",
    "latest-projects": "Latest Projects _ Josef Kulovany.htm"
}

def sanitize_filename(fn):
    return quote(fn, safe='').replace('%20', '_').replace('%', '_')

def encode_image(path: Path):
    try:
        mime, _ = mimetypes.guess_type(path.name)
        mime = mime or "application/octet-stream"
        with open(path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode()
            return f"data:{mime};base64,{encoded}"
    except:
        return None

def fetch_remote_image(url):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        mime, _ = mimetypes.guess_type(url)
        mime = mime or "application/octet-stream"
        encoded = base64.b64encode(r.content).decode()
        return f"data:{mime};base64,{encoded}"
    except:
        return None

def fetch_text(url: str) -> str:
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        return r.text
    except:
        return None

def process_html(file_path: Path):
    soup = BeautifulSoup(file_path.read_text(encoding='utf-8', errors='ignore'), "html.parser")
    base_path = file_path.parent

    if soup.head and not soup.head.find("base"):
        soup.head.insert(0, soup.new_tag("base", href="./"))

    # Inline styles
    for link in soup.find_all("link", rel="stylesheet", href=True):
        href = link['href']
        content = fetch_text(href) if href.startswith("http") else (base_path / href).read_text(encoding='utf-8', errors='ignore') if (base_path / href).exists() else None
        if content:
            style = soup.new_tag("style")
            style.string = content
            link.replace_with(style)

    # Inline JS
    for script in soup.find_all("script", src=True):
        src = script['src']
        content = fetch_text(src) if src.startswith("http") else (base_path / src).read_text(encoding='utf-8', errors='ignore') if (base_path / src).exists() else None
        if content:
            new = soup.new_tag("script")
            new.string = content
            script.replace_with(new)

    # Fix and wrap images
    for img in soup.find_all("img", src=True):
        src = img['src']
        decoded_src = unquote(src)
        local = None

        if not decoded_src.startswith(("http", "//")):
            if decoded_src.startswith("/"):
                local = (INPUT_DIR / decoded_src.lstrip('/')).resolve()
            else:
                local = (base_path / decoded_src).resolve()

        data_uri = encode_image(local) if local and local.exists() else fetch_remote_image(urljoin(BASE_URL, decoded_src))
        if data_uri:
            img['data-orig-src'] = src
            img['src'] = data_uri
            img.attrs.pop("srcset", None)
        else:
            img['src'] = urljoin(BASE_URL, decoded_src)

        if img.parent.name != "a":
            wrapper = soup.new_tag("a", href=img['data-orig-src'] if "data-orig-src" in img.attrs else img['src'])
            wrapper['class'] = 'zoomable'
            img.wrap(wrapper)

    # Lightbox script
    script_tag = soup.new_tag("script")
    script_tag.string = '''
document.addEventListener("click", function(e) {
  const link = e.target.closest("a.zoomable");
  if (!link) return;
  e.preventDefault();
  let existing = document.getElementById("zoom-overlay");
  if (existing) existing.remove();
  const overlay = document.createElement("div");
  overlay.id = "zoom-overlay";
  overlay.style = `
    position:fixed;top:0;left:0;width:100%;height:100%;
    background:rgba(0,0,0,0.8);
    display:flex;align-items:center;justify-content:center;
    cursor:pointer;
    z-index:10000;
  `;
  const img = document.createElement("img");
  img.src = link.href;
  img.style = "max-width:90%; max-height:90%; box-shadow: 0 0 10px black;";
  overlay.appendChild(img);
  overlay.addEventListener("click", () => overlay.remove());
  document.body.appendChild(overlay);
});
    '''
    (soup.body or soup).append(script_tag)

    # Update masthead links
    for a in soup.find_all("a", href=True):
        href = a['href']
        if href.startswith(BASE_URL):
            path = href[len(BASE_URL):].strip("/").lower()
            path = path.split('?')[0].split('#')[0]
            if path in masthead_map:
                a['href'] = masthead_map[path]

    return str(soup)

def recompile_all():
    for ext in ("*.html", "*.htm", "*.mhtml"):
        for src in INPUT_DIR.rglob(ext):
            if OUTPUT_DIR in src.parents:
                continue
            out_rel = src.relative_to(INPUT_DIR)
            out_path = OUTPUT_DIR / out_rel
            out_path.parent.mkdir(parents=True, exist_ok=True)
            html = process_html(src)
            out_path.write_text(html, encoding='utf-8')
            print("✓", out_rel)

if __name__ == "__main__":
    recompile_all()