#!/usr/bin/env bash
# Mirror Escapia property images referenced by Wilgus VRP unit pages.
#   $1 - destination root for the mirrored images (optional)
#   $2 - sitemap XML listing the unit pages (optional)
set -euo pipefail

base=${1:-/home/bowerybay/public_html/wilgus-images}
sitemap=${2:-/home/bowerybay/public_html/vrp-sitemap.xml}
readonly ua='Mozilla/5.0'

# Scratch space for the intermediate URL lists; removed by the EXIT trap.
tmpdir=$(mktemp -d)
readonly pages_file="$tmpdir/unit-pages.txt"
readonly images_file="$tmpdir/image-urls.txt"

# Remove the scratch directory on any exit path (normal, error, or signal).
cleanup() {
  # ${tmpdir:?} aborts instead of expanding empty, so an unset/empty var can
  # never turn this into an accidental wide rm; '--' guards odd leading chars.
  rm -rf -- "${tmpdir:?}"
}
trap cleanup EXIT

mkdir -p "$base"

echo "Step 1: extracting unit URLs from sitemap..."

# Pull every <loc> entry pointing at a /vrp/unit/ page and dedupe into the
# work list.  grep -o emits one match per line, each beginning with the
# literal '<loc>' (5 characters), which cut strips off.
grep -oE '<loc>https://www\.wilgusassociates\.com/vrp/unit/[^<]+' "$sitemap" \
  | cut -c6- \
  | sort -u > "$pages_file"

echo "Found $(wc -l < "$pages_file") unit pages"

echo "Step 2: extracting Escapia image URLs from unit pages..."

: > "$images_file"

# For each unit page, fetch the HTML and harvest Escapia image URLs.
# Splitting on '"' isolates attribute values; the sed un-escapes JSON-style
# "\/" so URLs embedded in inline scripts are caught too.  grep -F treats
# the URL prefix as a fixed string (the dots in the hostname are literal,
# not "any character").  IFS= read -r preserves whitespace/backslashes, and
# the || [ -n "$page" ] clause handles a missing trailing newline.
while IFS= read -r page || [ -n "$page" ]; do
  [ -z "$page" ] && continue
  echo "Reading: $page"

  curl -A "$ua" -L -s "$page" \
  | tr '"' '\n' \
  | sed 's#\\/#/#g' \
  | grep -F 'https://pictures.escapia.com/WILGUS/' \
  | grep -Ei '\.(jpg|jpeg|png|webp)($|[?])' \
  >> "$images_file" || true   # a page with no matching images is not an error

done < "$pages_file"

sort -u "$images_file" -o "$images_file"

echo "Found $(wc -l < "$images_file") unique image URLs"

echo "Step 3: downloading images with mirrored directory structure..."

while IFS= read -r url || [ -n "$url" ]; do
  [ -z "$url" ] && continue

  # Mirror the remote path under $base, e.g.
  # https://pictures.escapia.com/WILGUS/a/b.jpg -> $base/WILGUS/a/b.jpg
  rel="${url#https://pictures.escapia.com/}"
  out="$base/$rel"
  mkdir -p "$(dirname "$out")"

  # Already downloaded (non-empty) -> skip, which makes re-runs resumable.
  if [ -s "$out" ]; then
    echo "Skip: $out"
    continue
  fi

  # Download to a temp name first so an interrupted transfer never leaves a
  # truncated file at the final path.
  tmpfile="${out}.tmp"

  # --fail makes curl return non-zero on HTTP errors (404/500) instead of
  # silently saving the HTML error page as if it were the image.
  if curl -A "$ua" -L --fail --retry 3 --retry-delay 2 -s "$url" -o "$tmpfile"; then
    if [ -s "$tmpfile" ]; then
      mv "$tmpfile" "$out"
      echo "Saved: $out"
    else
      rm -f "$tmpfile"
      echo "Failed (empty): $url" >&2
    fi
  else
    rm -f "$tmpfile"
    echo "Failed: $url" >&2
  fi

done < "$images_file"

# Final report: counts come from the (deduped) work lists built above.
printf '\nDone.\n'
printf 'Unit pages processed: %s\n' "$(wc -l < "$pages_file")"
printf 'Unique image URLs found: %s\n' "$(wc -l < "$images_file")"
printf 'Saved under: %s\n' "$base"