#!/usr/bin/env bash
# Mirror Escapia property images referenced by Wilgus VRP unit pages.
#   $1 - destination root for the mirrored images (optional)
#   $2 - sitemap XML listing the unit pages (optional)
set -euo pipefail

base=${1:-/home/bowerybay/public_html/wilgus-images}
sitemap=${2:-/home/bowerybay/public_html/vrp-sitemap.xml}
readonly ua='Mozilla/5.0'

# Scratch space for the intermediate URL lists; removed by the EXIT trap.
tmpdir=$(mktemp -d)
readonly pages_file="$tmpdir/unit-pages.txt"
readonly images_file="$tmpdir/image-urls.txt"

# Remove the scratch directory on any exit path (normal, error, or signal).
cleanup() {
  # ${tmpdir:?} aborts instead of expanding empty, so an unset/empty var can
  # never turn this into an accidental wide rm; '--' guards odd leading chars.
  rm -rf -- "${tmpdir:?}"
}
trap cleanup EXIT

mkdir -p "$base"

echo "Step 1: extracting unit URLs from sitemap..."

# Pull every <loc> entry pointing at a /vrp/unit/ page and dedupe into the
# work list.  grep -o emits one match per line, each beginning with the
# literal '<loc>' (5 characters), which cut strips off.
grep -oE '<loc>https://www\.wilgusassociates\.com/vrp/unit/[^<]+' "$sitemap" \
  | cut -c6- \
  | sort -u > "$pages_file"

echo "Found $(wc -l < "$pages_file") unit pages"

echo "Step 2: extracting Escapia image URLs from unit pages..."

: > "$images_file"

# For each unit page, fetch the HTML and harvest Escapia image URLs.
# Splitting on '"' isolates attribute values; the sed un-escapes JSON-style
# "\/" so URLs embedded in inline scripts are caught too.  grep -F treats
# the URL prefix as a fixed string (the dots in the hostname are literal,
# not "any character").  IFS= read -r preserves whitespace/backslashes, and
# the || [ -n "$page" ] clause handles a missing trailing newline.
while IFS= read -r page || [ -n "$page" ]; do
  [ -z "$page" ] && continue
  echo "Reading: $page"

  curl -A "$ua" -L -s "$page" \
  | tr '"' '\n' \
  | sed 's#\\/#/#g' \
  | grep -F 'https://pictures.escapia.com/WILGUS/' \
  | grep -Ei '\.(jpg|jpeg|png|webp)($|[?])' \
  >> "$images_file" || true   # a page with no matching images is not an error

done < "$pages_file"

sort -u "$images_file" -o "$images_file"

echo "Found $(wc -l < "$images_file") unique image URLs"

echo "Step 3: downloading images with mirrored directory structure..."

while IFS= read -r url || [ -n "$url" ]; do
  [ -z "$url" ] && continue

  # Mirror the remote path under $base, e.g.
  # https://pictures.escapia.com/WILGUS/a/b.jpg -> $base/WILGUS/a/b.jpg
  rel="${url#https://pictures.escapia.com/}"
  out="$base/$rel"
  mkdir -p "$(dirname "$out")"

  # Already downloaded (non-empty) -> skip, which makes re-runs resumable.
  if [ -s "$out" ]; then
    echo "Skip: $out"
    continue
  fi

  # Download to a temp name first so an interrupted transfer never leaves a
  # truncated file at the final path.
  tmpfile="${out}.tmp"

  # --fail makes curl return non-zero on HTTP errors (404/500) instead of
  # silently saving the HTML error page as if it were the image.
  if curl -A "$ua" -L --fail --retry 3 --retry-delay 2 -s "$url" -o "$tmpfile"; then
    if [ -s "$tmpfile" ]; then
      mv "$tmpfile" "$out"
      echo "Saved: $out"
    else
      rm -f "$tmpfile"
      echo "Failed (empty): $url" >&2
    fi
  else
    rm -f "$tmpfile"
    echo "Failed: $url" >&2
  fi

done < "$images_file"

# Final report: counts come from the (deduped) work lists built above.
printf '\nDone.\n'
printf 'Unit pages processed: %s\n' "$(wc -l < "$pages_file")"
printf 'Unique image URLs found: %s\n' "$(wc -l < "$images_file")"
printf 'Saved under: %s\n' "$base"